diff --git a/CMakeLists.txt b/CMakeLists.txt
index f122dbb9cfc09..43bd4e0fcf86f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -126,6 +126,8 @@ if(WIN32)
         endforeach(flag_var)
     endif()

+    # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally
+    add_definitions("-DNOMINMAX")
     # windows build turn off warnings, use parallel compiling.
     foreach(flag_var
         CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index d89ecd27c0954..c7a6f04b5f40a 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()

 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211129")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220104")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 06dcd9623376f..51d4fa42577a5 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -266,7 +266,7 @@ copy(inference_lib_dist
 # the header file of pten is copied to the experimental directory,
 # the include path of pten needs to be changed to adapt to inference api path
 add_custom_command(TARGET inference_lib_dist POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten.cmake"
+    COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten_header.cmake"
     COMMENT "Change pten header include path to adapt to inference api path")

 # CAPI inference library for only inference
diff --git a/cmake/infrt_lib.cmake b/cmake/infrt_lib.cmake
index 73a8cdbee51c1..5b27c9d8400cc 100644
--- a/cmake/infrt_lib.cmake
+++ b/cmake/infrt_lib.cmake
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set(PADDLE_INFRT_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_infrt_install_dir" CACHE STRING
+set(INFRT_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_infrt_install_dir" CACHE STRING
     "A path setting paddle infrt shared and static libraries")

 function(copy TARGET)
@@ -52,18 +52,17 @@ add_custom_target(infrt_lib_dist DEPENDS ${infrt_lib_deps})
 # CMakeCache Info
 copy(infrt_lib_dist
         SRCS ${CMAKE_BINARY_DIR}/CMakeCache.txt
-        DSTS ${PADDLE_INFRT_INSTALL_DIR})
+        DSTS ${INFRT_INSTALL_DIR})

-set(src_dir "${PADDLE_SOURCE_DIR}/paddle/infrt")
-set(paddle_infrt_lib ${PADDLE_BINARY_DIR}/paddle/infrt/libinfrt.*)
+set(infrt_lib ${INFRT_BINARY_DIR}/libinfrt.*)

 copy(infrt_lib_dist
-        SRCS ${src_dir}/api/infrt_api.h ${paddle_infrt_lib}
-        DSTS ${PADDLE_INFRT_INSTALL_DIR}/infrt/include ${PADDLE_INFRT_INSTALL_DIR}/infrt/lib)
+        SRCS ${INFRT_SOURCE_DIR}/api/infrt_api.h ${infrt_lib}
+        DSTS ${INFRT_INSTALL_DIR}/infrt/include ${INFRT_INSTALL_DIR}/infrt/lib)

 copy(infrt_lib_dist
-        SRCS ${CMAKE_BINARY_DIR}/paddle/infrt/paddle/framework.pb.h
-        DSTS ${PADDLE_INFRT_INSTALL_DIR}/infrt/include/internal)
+        SRCS ${INFRT_BINARY_DIR}/paddle/framework.pb.h
+        DSTS ${INFRT_INSTALL_DIR}/infrt/include/internal)

 # paddle fluid version
 function(version version_file)
@@ -74,4 +73,4 @@ function(version version_file)
   file(WRITE ${version_file} "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n")
   file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
 endfunction()
-version(${PADDLE_INFRT_INSTALL_DIR}/version.txt)
+version(${INFRT_INSTALL_DIR}/version.txt)
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 673b33900d673..2d1ce4e834217 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -196,6 +196,8 @@ function(op_library TARGET)
     list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
     list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
     list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
+    list(REMOVE_ITEM hip_srcs "cholesky_solve_op.cu")
+    list(REMOVE_ITEM hip_srcs "lu_op.cu")
     list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
     list(REMOVE_ITEM hip_srcs "svd_op.cu")
     list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu")
diff --git a/cmake/pten.cmake b/cmake/pten_header.cmake
similarity index 100%
rename from cmake/pten.cmake
rename to cmake/pten_header.cmake
diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake
new file mode 100644
index 0000000000000..947defcea4a61
--- /dev/null
+++ b/cmake/pten_kernel.cmake
@@ -0,0 +1,183 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +# call kernel_declare need to make sure whether the target of input exists +function(kernel_declare TARGET_LIST) + foreach(kernel_path ${TARGET_LIST}) + file(READ ${kernel_path} kernel_impl) + # TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL + # NOTE(chenweihang): now we don't recommend to use digit in kernel name + string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") + if (NOT first_registry STREQUAL "") + # parse the first kernel name + string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}") + string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") + string(REPLACE "," "" kernel_name "${kernel_name}") + string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") + # append kernel declare into declarations.h + # TODO(chenweihang): default declare ALL_LAYOUT for each kernel + if (${kernel_path} MATCHES "./cpu\/") + file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./gpu\/") + file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./xpu\/") + file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") + else () + # deal with device independent kernel, now we use CPU temporaary + file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + endif() + endif() + endforeach() +endfunction() + +function(kernel_library TARGET) + set(common_srcs) + set(cpu_srcs) + set(gpu_srcs) + set(xpu_srcs) + # parse and save the deps kerenl targets + set(all_srcs) + set(kernel_deps) + + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + list(LENGTH kernel_library_SRCS kernel_library_SRCS_len) + # one kernel only match one impl file in each backend + if (${kernel_library_SRCS_len} EQUAL 0) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc) + list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc) + endif() + if (WITH_GPU OR WITH_ROCM) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu) + endif() + endif() + if (WITH_XPU) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) + list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) + endif() + endif() + else() + # TODO(chenweihang): impl compile by source later + endif() + + list(APPEND all_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.h) + list(APPEND all_srcs ${common_srcs}) + list(APPEND all_srcs ${cpu_srcs}) + list(APPEND all_srcs ${gpu_srcs}) + list(APPEND all_srcs ${xpu_srcs}) + foreach(src ${all_srcs}) + file(READ ${src} target_content) + string(REGEX MATCHALL "#include \"paddle\/pten\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) + foreach(include_kernel ${include_kernels}) + string(REGEX REPLACE "#include \"paddle\/pten\/kernels\/" "" kernel_name ${include_kernel}) + string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) + list(APPEND kernel_deps ${kernel_name}) + endforeach() + endforeach() + list(REMOVE_DUPLICATES kernel_deps) + list(REMOVE_ITEM kernel_deps ${TARGET}) + + list(LENGTH common_srcs common_srcs_len) + list(LENGTH 
cpu_srcs cpu_srcs_len) + list(LENGTH gpu_srcs gpu_srcs_len) + list(LENGTH xpu_srcs xpu_srcs_len) + + if (${common_srcs_len} GREATER 0) + # If the kernel has a device independent public implementation, + # we will use this implementation and will not adopt the implementation + # under specific devices + if (WITH_GPU) + nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_ROCM) + hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + else() + cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() + else() + # If the kernel has a header file declaration, but no corresponding + # implementation can be found, this is not allowed + if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND + ${xpu_srcs_len} EQUAL 0) + message(FATAL_ERROR "Cannot find any implementation for ${TARGET}") + else() + if (WITH_GPU) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) + nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() + elseif (WITH_ROCM) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) + hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() + else() + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) + cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() + endif() + endif() + endif() + + if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) + # append target into PTEN_KERNELS property + get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) + set(pten_kernels ${pten_kernels} ${TARGET}) + set_property(GLOBAL PROPERTY PTEN_KERNELS ${pten_kernels}) + endif() + + # parse kernel name and auto generate kernel declaration + # here, we don't need to check WITH_XXX, because if not WITH_XXX, the + # xxx_srcs_len will be equal to 0 + if (${common_srcs_len} GREATER 0) + kernel_declare(${common_srcs}) + endif() + if (${cpu_srcs_len} GREATER 0) + kernel_declare(${cpu_srcs}) + endif() + if (${gpu_srcs_len} GREATER 0) + kernel_declare(${gpu_srcs}) + endif() + if (${xpu_srcs_len} GREATER 0) + kernel_declare(${xpu_srcs}) + endif() +endfunction() + +function(register_kernels) + set(options "") + set(oneValueArgs "") + set(multiValueArgs EXCLUDES DEPS) + cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + file(GLOB KERNELS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_kernel.h") + string(REPLACE ".h" "" KERNELS "${KERNELS}") + list(LENGTH register_kernels_DEPS register_kernels_DEPS_len) + + foreach(target ${KERNELS}) + list(FIND register_kernels_EXCLUDES ${target} _index) + if (${_index} EQUAL -1) + if (${register_kernels_DEPS_len} GREATER 0) + kernel_library(${target} DEPS ${register_kernels_DEPS}) + else() + kernel_library(${target}) + endif() + endif() + endforeach() +endfunction() diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 51f1d936bd70a..d8372e10888d9 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -5,18 +5,23 @@ endif() proto_library(interceptor_message_proto SRCS interceptor_message.proto) if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) - set(BRPC_DEPS brpc ssl crypto protobuf gflags glog zlib 
leveldb snappy gflags glog) + set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog) else() set(BRPC_DEPS "") endif() +cc_library(task_loop_thread_pool SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc DEPS enforce glog) + cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc - interceptor.cc compute_interceptor.cc amplifier_interceptor.cc interceptor_message_service.cc message_bus.cc - DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper op_registry - executor_gc_helper ${BRPC_DEPS}) + interceptor.cc compute_interceptor.cc amplifier_interceptor.cc message_service.cc message_bus.cc + DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto task_loop_thread_pool collective_helper + op_registry executor_gc_helper gflags glog ${BRPC_DEPS}) if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() set_source_files_properties(interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -24,8 +29,8 @@ if(WITH_DISTRIBUTE) set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(carrier.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(interceptor_message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set_source_files_properties(interceptor_message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(message_service.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(message_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) add_subdirectory(test) endif() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 9d9755569b2fc..79ca6f467a38d 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" -#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -27,81 +27,57 @@ namespace distributed { USE_INTERCEPTOR(Compute); USE_INTERCEPTOR(Amplifier); -void Carrier::Init(std::shared_ptr runtime_graph, - framework::Scope* root_scope, - framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place) { - PADDLE_ENFORCE_EQ(is_init_, false, platform::errors::AlreadyExists( - "Carrier is already init.")); - runtime_graph_ = runtime_graph; +void Carrier::Init( + int64_t rank, + const std::unordered_map& interceptor_id_to_rank) { + rank_ = rank; + interceptor_id_to_rank_ = interceptor_id_to_rank; + + // TODO(fleet_exe dev): thread pool + thread_num_ = 1; + thread_pool_.SetThreadNum(thread_num_); + thread_pool_.Start(); +} + +void Carrier::Init( + int64_t rank, + const std::unordered_map& interceptor_id_to_rank, + const std::unordered_map& interceptor_id_to_node, + framework::Scope* root_scope, framework::Scope* minibatch_scope, + const std::vector& microbatch_scopes, + const platform::Place& place) { + rank_ = rank; + interceptor_id_to_rank_ = interceptor_id_to_rank; + interceptor_id_to_node_ = interceptor_id_to_node; minibatch_scope_ = minibatch_scope; microbatch_scopes_ = microbatch_scopes; place_ = place; root_scope_ = root_scope; dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); + + // TODO(fleet_exe dev): thread pool + thread_num_ = 1; + thread_pool_.SetThreadNum(thread_num_); + thread_pool_.Start(); + CreateInterceptors(); is_init_ = true; } -void Carrier::Release() { - // NOTE(wangxi): must join before `Derived Interceptor` destruct, - // otherwise Derived object will be destructed before thread complete. - - // Sending STOP msg to the source interceptor - PADDLE_ENFORCE_EQ(msg_bus_->IsInit(), true, - platform::errors::PreconditionNotMet( - "Using message bus since it has not been initialized. " - "Please invoke MessageBus::Init() before using it or " - "neccessary components are not ready.")); - for (int64_t id : source_interceptor_ids_) { - VLOG(3) << "Carrier Release is sending stop to source interceptor " << id - << "."; - InterceptorMessage stop_msg; - // source node STOP is send by carrier, so set src_id=-1 - stop_msg.set_src_id(-1); - stop_msg.set_dst_id(id); - stop_msg.set_message_type(STOP); - Send(stop_msg); - } - - // TODO(wangxi): Maybe need a better to use thread. - for (auto& interceptor : interceptor_idx_to_interceptor_) { - interceptor.second->Join(); - } -} +void Carrier::Release() {} Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } bool Carrier::EnqueueInterceptorMessage( const InterceptorMessage& interceptor_message) { - // enqueue message to interceptor - if (interceptor_message.ctrl_message()) { - // handle control message - return true; - } else { - { - std::unique_lock lock_creating(creating_flag_mutex_); - if (creating_interceptors_) { - std::unique_lock lock_message(tmp_message_mutex_); - // Cannot handle the message to interceptor since interceptors - // are still under creating. Will enqueue into a tmp stack. 
- VLOG(3) << "Receiving message while creating interceptors."; - message_tmp_.emplace_back(interceptor_message); - return true; - } - } - int64_t dst_id = interceptor_message.dst_id(); - Interceptor* dst_interceptor = GetInterceptor(dst_id); - bool rst = - dst_interceptor->EnqueueRemoteInterceptorMessage(interceptor_message); - if (rst) { - std::condition_variable& interceptor_cond_var = - dst_interceptor->GetCondVar(); - interceptor_cond_var.notify_all(); - } - return rst; - } + PADDLE_ENFORCE_EQ( + interceptor_message.ctrl_message(), false, + platform::errors::Fatal( + "Control message should be only send inter rank using message bus.")); + int64_t dst_id = interceptor_message.dst_id(); + Interceptor* dst_interceptor = GetInterceptor(dst_id); + dst_interceptor->EnqueueRemoteInterceptorMessage(interceptor_message); + return true; } Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) { @@ -119,13 +95,14 @@ void Carrier::Wait() { cond_var_.wait(lock); } -void Carrier::Start() { - PADDLE_ENFORCE_EQ(msg_bus_->IsInit(), true, - platform::errors::PreconditionNotMet( - "Using message bus since it has not been initialized. " - "Please invoke MessageBus::Init() before using it or " - "neccessary components are not ready.")); +void Carrier::WakeUp() { + // probably double notify, but ok for ut + cond_var_.notify_all(); +} +void Carrier::Start() { + PADDLE_ENFORCE_EQ(is_init_, true, platform::errors::PreconditionNotMet( + "Using carrier before initialized.")); for (int64_t id : source_interceptor_ids_) { VLOG(3) << "Carrier Start is sending start to source interceptor " << id << "."; @@ -136,17 +113,42 @@ void Carrier::Start() { start_msg.set_message_type(DATA_IS_READY); Send(start_msg); } + // TODO(wangxi): async step Wait(); dev_ctx_->Wait(); } -std::condition_variable& Carrier::GetCondVar() { return cond_var_; } - bool Carrier::IsInit() const { return is_init_; } -// TODO(liyurui): Move SendIntra into carrier -bool Carrier::Send(const InterceptorMessage& msg) const { - return msg_bus_->Send(msg); +int64_t Carrier::GetRank(int64_t interceptor_id) const { + PADDLE_ENFORCE_NE( + interceptor_id_to_rank_.find(interceptor_id), + interceptor_id_to_rank_.end(), + platform::errors::NotFound("Cannot find rank for interceptor id %lld.", + interceptor_id)); + return interceptor_id_to_rank_.at(interceptor_id); +} + +bool Carrier::Send(const InterceptorMessage& msg) { + int64_t src_id = (msg.src_id() == -1) ? 
msg.dst_id() : msg.src_id(); + int64_t dst_id = msg.dst_id(); + int64_t src_rank = GetRank(src_id); + int64_t dst_rank = GetRank(dst_id); + PADDLE_ENFORCE_EQ( + src_rank, rank_, + platform::errors::Fatal("The source rank id %lld, which is not equal to " + "the carrier rank id %lld.", + src_rank, rank_)); + if (src_rank == dst_rank) { + VLOG(3) << "Send a message from interceptor " << src_id + << " to interceptor " << dst_id << ", which are in the same ranks."; + return EnqueueInterceptorMessage(msg); + } else { + VLOG(3) << "Send a message from interceptor " << src_id + << " to interceptor " << dst_id + << ", which are in different ranks."; + return GlobalVal::Get()->Send(dst_rank, msg); + } } Interceptor* Carrier::SetInterceptor(int64_t interceptor_id, @@ -158,51 +160,19 @@ Interceptor* Carrier::SetInterceptor(int64_t interceptor_id, "The interceptor id should be unique.", interceptor_id)); interceptor->RegisterCarrier(this); + + // TODO(fleet_exe dev): get loop + auto* loop = thread_pool_.GetLoop(interceptor_id % thread_num_); + PADDLE_ENFORCE_NOT_NULL( + loop, platform::errors::Fatal("thread task loop must not null")); + interceptor->RegisterTaskLoop(loop); + auto* ptr = interceptor.get(); interceptor_idx_to_interceptor_.insert( std::make_pair(interceptor_id, std::move(interceptor))); return ptr; } -void Carrier::SetCreatingFlag(bool flag) { - // set the creating flag - creating_flag_mutex_.lock(); - VLOG(3) << "Carrier is set the creating flag from " << creating_interceptors_ - << " to " << flag << "."; - creating_interceptors_ = flag; - creating_flag_mutex_.unlock(); - if (!flag) { - for (auto& pair : interceptor_idx_to_interceptor_) { - // update the source interceptor id - if (std::find(source_interceptor_ids_.begin(), - source_interceptor_ids_.end(), - pair.first) == source_interceptor_ids_.end()) { - auto task = pair.second->GetTaskNode(); - if (task != nullptr && task->upstream().empty()) { - source_interceptor_ids_.emplace_back(pair.first); - } - } - } - // finish create interceptors outside, handle tmp messsages - HandleTmpMessages(); - } -} - -void Carrier::HandleTmpMessages() { - // NOTE: It's ok lock on the tmp_message_mutex_ here, when enter this - // `HandleTmpMessages` method, the creating_interceptors_ flag - // must be false, therefore, there won't have conflict with the - // lock on the tmp_message_mutex_ inside `EnqueueInterceptorMessage` - // on the same thread. 
- std::unique_lock lock(tmp_message_mutex_); - VLOG(3) << "Carrier has received " << message_tmp_.size() - << " messages during creating interceptors."; - for (const auto& msg : message_tmp_) { - EnqueueInterceptorMessage(msg); - } - message_tmp_.clear(); -} - static std::shared_ptr GetGC( const platform::Place& place) { int64_t max_memory_size = framework::GetEagerDeletionThreshold(); @@ -222,13 +192,13 @@ static std::shared_ptr GetGC( } void Carrier::CreateInterceptors() { - if (runtime_graph_->intercepter_id_to_node().empty()) return; + if (interceptor_id_to_node_.empty()) return; auto gc = GetGC(place_); // create each Interceptor // no auto init since there is no config - for (const auto& item : runtime_graph_->intercepter_id_to_node()) { + for (const auto& item : interceptor_id_to_node_) { int64_t interceptor_id = item.first; TaskNode* task_node = item.second; @@ -260,12 +230,6 @@ void Carrier::CreateInterceptors() { source_interceptor_ids_.emplace_back(interceptor_id); } } - // The carrier will be always waiting for outside initializer - // since there is no interceptor has been created during auto init - creating_flag_mutex_.lock(); - creating_interceptors_ = false; - creating_flag_mutex_.unlock(); - HandleTmpMessages(); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index e850c120bdbe5..75ac07083a796 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -24,6 +24,7 @@ #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h" +#include "paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -42,17 +43,25 @@ class InterceptorMessageServiceImpl; class RuntimeGraph; class MessageBus; +// TODO(liyurui): Add CarrierId instead of std::string + class Carrier final { public: - Carrier() = default; + explicit Carrier(const std::string& carrier_id) : carrier_id_(carrier_id) {} ~Carrier(); - void Init(std::shared_ptr runtime_graph, - framework::Scope* root_scope, framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place); + void Init(int64_t rank, + const std::unordered_map& interceptor_id_to_rank); + void Init( + int64_t rank, + const std::unordered_map& interceptor_id_to_rank, + const std::unordered_map& interceptor_id_to_node, + framework::Scope* root_scope, framework::Scope* minibatch_scope, + const std::vector& microbatch_scopes, + const platform::Place& place); void Release(); void Wait(); + void WakeUp(); // Enqueue a message to corresponding interceptor id bool EnqueueInterceptorMessage(const InterceptorMessage& interceptor_message); @@ -64,31 +73,20 @@ class Carrier final { Interceptor* SetInterceptor(int64_t interceptor_id, std::unique_ptr); - void SetCreatingFlag(bool flag); - void SetMsgBus(const std::shared_ptr& msg_bus) { - msg_bus_ = msg_bus; - } - - std::condition_variable& GetCondVar(); - void Start(); bool IsInit() const; - bool Send(const InterceptorMessage& msg) const; - - // NOTE: This mutex will be used in interceptor's RunOps function. - // This mutex is used for avoiding forward ops and backward ops run - // simultaneously, which will lead to a random hang for some sync ops. 
- std::mutex run; + bool Send(const InterceptorMessage& msg); private: DISABLE_COPY_AND_ASSIGN(Carrier); + Carrier() = delete; // create each Interceptor void CreateInterceptors(); - void HandleTmpMessages(); + int64_t GetRank(int64_t interceptor_id) const; // interceptor logic id to actually interceptor std::unordered_map> @@ -96,10 +94,6 @@ class Carrier final { std::vector source_interceptor_ids_; - std::vector message_tmp_{}; - std::mutex tmp_message_mutex_; - bool creating_interceptors_{true}; - std::mutex creating_flag_mutex_; bool is_init_{false}; std::mutex running_mutex_; @@ -109,9 +103,13 @@ class Carrier final { framework::Scope* minibatch_scope_; paddle::platform::Place place_; paddle::platform::DeviceContext* dev_ctx_{nullptr}; - std::shared_ptr runtime_graph_; - std::shared_ptr msg_bus_; + int64_t rank_; + std::string carrier_id_; + std::unordered_map interceptor_id_to_node_; std::unordered_map interceptor_id_to_rank_; + int thread_num_; + TaskLoopThreadPool thread_pool_; + std::unordered_set interceptor_ids_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 1f0d3408a3da8..d934ab1948e7e 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -170,7 +170,6 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { } void ComputeInterceptor::RunOps() { - std::unique_lock lock(carrier_->run); VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " << step_ + 1 << " time."; for (auto op : node_->ops()) { @@ -198,6 +197,7 @@ void ComputeInterceptor::Run() { if (is_last_ && (step_ % node_->max_run_times() == 0)) { VLOG(3) << "Interceptor " << GetInterceptorId() << " is stopping carrier."; + // FIXME(wangxi): with multi sink interceptor StopCarrier(); } } diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index f7173a7b8bdfb..d6c1e678ad4f7 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" -#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -32,21 +32,22 @@ FleetExecutor::FleetExecutor(const std::string& exe_desc_str) { bool parse_flag = exe_desc_.ParseFromString(exe_desc_str); PADDLE_ENFORCE(parse_flag, platform::errors::PreconditionNotMet( "Error occurs while parsing string to proto")); + // Message bus will be created and inited only once + GlobalVal::Create(); + InitMessageBus(); } FleetExecutor::~FleetExecutor() { root_scope_->DropKids(); - GetCarrier().Release(); -} - -Carrier& FleetExecutor::GetCarrier() { - static Carrier carrier; - return carrier; + for (const auto& carrier_id : carrier_ids_) { + GlobalMap::Get(carrier_id)->Release(); + } } void FleetExecutor::Init( - const framework::ProgramDesc& program_desc, framework::Scope* scope, - const platform::Place& place, const std::vector& task_nodes, + const std::string& carrier_id, const framework::ProgramDesc& program_desc, + framework::Scope* scope, const platform::Place& place, + const std::vector& task_nodes, const std::unordered_map& task_id_to_rank) { PADDLE_ENFORCE_GT(task_nodes.size(), 0, platform::errors::InvalidArgument( @@ -83,18 +84,19 @@ void FleetExecutor::Init( CopyParameters(i, program_desc); } VLOG(5) << runtime_graph_->DebugString(); - msg_bus_ = std::make_shared(); - InitCarrier(); - InitMessageBus(); + Carrier* carrier = + GlobalMap::Create(carrier_id, carrier_id); + carrier_ids_.insert(carrier_id); + // Set current running carrier + GlobalVal::Set(new std::string(carrier_id)); + InitCarrier(carrier); + GlobalVal::Get()->Barrier(); } -void FleetExecutor::InitCarrier() { - Carrier& carrier = GetCarrier(); - if (!carrier.IsInit()) { - carrier.SetMsgBus(msg_bus_); - carrier.Init(runtime_graph_, root_scope_, minibatch_scope_, - microbatch_scopes_, place_); - } +void FleetExecutor::InitCarrier(Carrier* carrier) { + carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), + runtime_graph_->interceptor_id_to_node(), root_scope_, + minibatch_scope_, microbatch_scopes_, place_); } void FleetExecutor::InitMessageBus() { @@ -127,22 +129,17 @@ void FleetExecutor::InitMessageBus() { VLOG(3) << "The number of ranks are " << (rank_to_addr.size() == 0 ? 1 : rank_to_addr.size()) << "."; VLOG(5) << ss.str(); - if (!msg_bus_->IsInit()) { - msg_bus_->Init(runtime_graph_->intercepter_id_to_rank(), rank_to_addr, - addr); - } + GlobalVal::Get()->Init(cur_rank, rank_to_addr, addr); } -void FleetExecutor::Run() { - // Run - Carrier& carrier = GetCarrier(); - PADDLE_ENFORCE_EQ( - carrier.IsInit(), true, - platform::errors::Unavailable("Carrier has not been init yet.")); - PADDLE_ENFORCE_EQ( - msg_bus_->IsInit(), true, - platform::errors::Unavailable("MessageBus has not been init yet.")); - carrier.Start(); +void FleetExecutor::Run(const std::string& carrier_id) { + Carrier* carrier = GlobalMap::Get(carrier_id); + // Set current running carrier + if (*GlobalVal::Get() != carrier_id) { + GlobalVal::Set(new std::string(carrier_id)); + GlobalVal::Get()->Barrier(); + } + carrier->Start(); for (auto* micro_scop : microbatch_scopes_) { // By default, we should delete all kid scopes after run executor because // some operators may create local scope when running, such as while_op. 
diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index a66288525c6f9..89ab4c62d386f 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -16,6 +16,7 @@ #include #include +#include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" @@ -30,25 +31,23 @@ namespace distributed { class RuntimeGraph; class MessageBus; class TaskNode; -class Carrier; class FleetExecutor final { public: FleetExecutor() = delete; explicit FleetExecutor(const std::string& exe_desc_str); ~FleetExecutor(); - void Init(const framework::ProgramDesc& program_desc, framework::Scope* scope, + void Init(const std::string& carrier_id, + const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, const std::vector& task_nodes, const std::unordered_map& task_id_to_rank); - void Run(); - // TODO(liyurui): Change to use registry table for multi-carrier. - static Carrier& GetCarrier(); + void Run(const std::string& carrier_id); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); void InitMessageBus(); - void InitCarrier(); + void InitCarrier(Carrier* carrier); void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; @@ -56,9 +55,7 @@ class FleetExecutor final { framework::Scope* minibatch_scope_; platform::Place place_; std::vector microbatch_scopes_; - // The carriers under FleetExecutor will share message bus, - // using shared_ptr to manage lifetime and condition race. - std::shared_ptr msg_bus_; + std::unordered_set carrier_ids_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/global.h b/paddle/fluid/distributed/fleet_executor/global.h new file mode 100644 index 0000000000000..776f314e6afb2 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/global.h @@ -0,0 +1,113 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +template +class GlobalVal final { + public: + static T* Get() { + T* ptr = GetPPtr()->get(); + PADDLE_ENFORCE_NOT_NULL( + ptr, platform::errors::NotFound("This value is not global value.")); + return ptr; + } + template + static T* Create(Args&&... 
args) { + auto* ptr = GetPPtr(); + PADDLE_ENFORCE_EQ(ptr->get(), nullptr, + platform::errors::AlreadyExists( + "This value is already a global value.")); + T* item = new T(std::forward(args)...); + ptr->reset(item); + return item; + } + + static T* Set(T* new_item) { + auto* ptr = GetPPtr(); + ptr->reset(new_item); + return ptr->get(); + } + + private: + static std::unique_ptr* GetPPtr() { + static std::unique_ptr ptr; + return &ptr; + } +}; + +template +class GlobalMap final { + public: + static ValueT* Get(KeyT id) { + ValueT* item = GetPPtr(id)->get(); + PADDLE_ENFORCE_NOT_NULL( + item, platform::errors::NotFound("This value is not in global map.")); + return item; + } + + template + static ValueT* Create(KeyT id, Args&&... args) { + auto* ptr = GetPPtr(id); + PADDLE_ENFORCE_EQ(ptr->get(), nullptr, + platform::errors::AlreadyExists( + "This value has already in global map.")); + ValueT* item = new ValueT(std::forward(args)...); + ptr->reset(item); + return item; + } + + private: + static std::unique_ptr* GetPPtr(KeyT id) { + static std::unordered_map> id_to_ptr; + return &id_to_ptr[id]; + } +}; + +template +class ThreadSafeGlobalMap final { + public: + static ValueT* Get(KeyT id) { + ValueT* item = GetPPtr(id)->get(); + PADDLE_ENFORCE_NOT_NULL( + item, platform::errors::NotFound( + "This value is not in thread safe global map.")); + return item; + } + template + static ValueT* Create(KeyT id, Args&&... args) { + auto* ptr = GetPPtr(id); + PADDLE_ENFORCE_EQ(ptr->get(), nullptr, + platform::errors::AlreadyExists( + "This value has already in thread safe global map.")); + ValueT* item = new ValueT(std::forward(args)...); + ptr->reset(item); + return item; + } + + private: + static std::unique_ptr* GetPPtr(KeyT id) { + static std::mutex mutex; + static std::unordered_map> id_to_ptr; + std::unique_lock lock(mutex); + return &id_to_ptr[id]; + } +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc index d649a84614e4d..710ebda41244e 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc @@ -14,26 +14,21 @@ #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/task_loop.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" namespace paddle { namespace distributed { Interceptor::Interceptor(int64_t interceptor_id, TaskNode* node) - : interceptor_id_(interceptor_id), node_(node) { - interceptor_thread_ = std::thread([this]() { - VLOG(3) << "Interceptor " << interceptor_id_ - << " starts the thread pooling it's local mailbox."; - PoolTheMailbox(); - }); -} - -Interceptor::~Interceptor() { Join(); } - -void Interceptor::Join() { - if (interceptor_thread_.joinable()) { - interceptor_thread_.join(); - } + : interceptor_id_(interceptor_id), node_(node) {} + +Interceptor::~Interceptor() { + // FIXME(wangxi): throw in stop function + // std::lock_guard lock(mutex_); + // PADDLE_ENFORCE_EQ(messages_.empty(), true, + // platform::errors::PreconditionNotMet( + // "Interceptor must destruct with messages empty")); } void Interceptor::RegisterMsgHandle(MsgHandle handle) { handle_ = handle; } @@ -44,32 +39,47 @@ void Interceptor::Handle(const InterceptorMessage& msg) { handle_(msg); } -void Interceptor::StopCarrier() { - PADDLE_ENFORCE_NOT_NULL(carrier_, 
platform::errors::PreconditionNotMet( - "Carrier is not registered.")); - std::condition_variable& cond_var = carrier_->GetCondVar(); - // probably double notify, but ok for ut - cond_var.notify_all(); -} +void Interceptor::LoopOnce() { + std::deque tmp_messages; + { + std::lock_guard lock(mutex_); + messages_.swap(tmp_messages); + } + PADDLE_ENFORCE_EQ(tmp_messages.empty(), false, + platform::errors::PreconditionNotMet( + "tmp_messages must not empty in task loop")); + + for (auto& msg : tmp_messages) { + const MessageType message_type = msg.message_type(); + VLOG(3) << "Interceptor " << interceptor_id_ << " has received a message" + << " from interceptor " << msg.src_id() + << " with message: " << message_type << "."; -std::condition_variable& Interceptor::GetCondVar() { - // get the conditional var - return cond_var_; + Handle(msg); + } } -int64_t Interceptor::GetInterceptorId() const { - // return the interceptor id - return interceptor_id_; +void Interceptor::StopCarrier() { + PADDLE_ENFORCE_NOT_NULL(carrier_, platform::errors::PreconditionNotMet( + "Carrier is not registered.")); + carrier_->WakeUp(); } -bool Interceptor::EnqueueRemoteInterceptorMessage( - const InterceptorMessage& interceptor_message) { +void Interceptor::EnqueueRemoteInterceptorMessage( + const InterceptorMessage& message) { // Called by Carrier, enqueue an InterceptorMessage to remote mailbox - VLOG(3) << "Enqueue message: " << interceptor_message.message_type() - << " into " << interceptor_id_ << "'s remote mailbox."; - std::unique_lock lock(remote_mailbox_mutex_); - remote_mailbox_.push(interceptor_message); - return true; + VLOG(3) << "Enqueue message: " << message.message_type() << " into " + << interceptor_id_ << "'s remote mailbox."; + + bool empty = false; + { + std::lock_guard lock(mutex_); + empty = messages_.empty(); + messages_.emplace_back(message); + } + if (empty) { + loop_->QueueInLoop([this]() { LoopOnce(); }); + } } bool Interceptor::Send(int64_t dst_id, InterceptorMessage& msg) { @@ -80,50 +90,6 @@ bool Interceptor::Send(int64_t dst_id, InterceptorMessage& msg) { return carrier_->Send(msg); } -void Interceptor::PoolTheMailbox() { - // pool the local mailbox, parse the Message - for (;;) { - if (local_mailbox_.empty()) { - // local mailbox is empty, fetch the remote mailbox - VLOG(3) << interceptor_id_ << "'s local mailbox is empty. 
" - << "Fetch the remote mailbox."; - PADDLE_ENFORCE_EQ(FetchRemoteMailbox(), true, - platform::errors::InvalidArgument( - "Error encountered when fetch remote mailbox.")); - } - const InterceptorMessage interceptor_message = local_mailbox_.front(); - local_mailbox_.pop(); - const MessageType message_type = interceptor_message.message_type(); - VLOG(3) << "Interceptor " << interceptor_id_ << " has received a message" - << " from interceptor " << interceptor_message.src_id() - << " with message: " << message_type << "."; - - Handle(interceptor_message); - - if (stop_) { - // break the pooling thread - VLOG(3) << "Interceptor " << interceptor_id_ << " is quiting."; - break; - } - } -} - -bool Interceptor::FetchRemoteMailbox() { - // fetch all Message from remote mailbox to local mailbox - // return true if remote mailbox not empty, otherwise return false - std::unique_lock lock(remote_mailbox_mutex_); - cond_var_.wait(lock, [this]() { return !remote_mailbox_.empty(); }); - if (remote_mailbox_.empty()) { - // the thread has been unblocked accidentally - return false; - } - while (!remote_mailbox_.empty()) { - local_mailbox_.push(std::move(remote_mailbox_.front())); - remote_mailbox_.pop(); - } - return true; -} - static InterceptorFactory::CreateInterceptorMap& GetInterceptorMap() { static InterceptorFactory::CreateInterceptorMap interceptorMap; return interceptorMap; diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h index bc20058074441..cb7ff2da89a9d 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/interceptor.h @@ -15,14 +15,15 @@ #pragma once #include +#include #include #include #include -#include #include #include #include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h" +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" @@ -37,6 +38,7 @@ namespace distributed { class TaskNode; class Carrier; +class TaskLoop; class Interceptor { public: @@ -49,21 +51,16 @@ class Interceptor { virtual ~Interceptor(); - void Join(); - // register interceptor handle void RegisterMsgHandle(MsgHandle handle); void Handle(const InterceptorMessage& msg); // return the interceptor id - int64_t GetInterceptorId() const; - - // return the conditional var - std::condition_variable& GetCondVar(); + int64_t GetInterceptorId() const { return interceptor_id_; } // Called by Carrier, enqueue an InterceptorMessage to remote mailbox - bool EnqueueRemoteInterceptorMessage( + void EnqueueRemoteInterceptorMessage( const InterceptorMessage& interceptor_message); bool Send(int64_t dst_id, InterceptorMessage& msg); // NOLINT @@ -79,6 +76,7 @@ class Interceptor { gc_ = gc; } void RegisterCarrier(Carrier* carrier) { carrier_ = carrier; } + void RegisterTaskLoop(TaskLoop* loop) { loop_ = loop; } TaskNode* GetTaskNode() const { return node_; } @@ -103,35 +101,16 @@ class Interceptor { std::shared_ptr gc_{nullptr}; Carrier* carrier_; + TaskLoop* loop_; private: - // pool the local mailbox, parse the Message - void PoolTheMailbox(); - - // fetch all Message from remote mailbox to local mailbox - // return true if remote mailbox not empty, otherwise return false - bool FetchRemoteMailbox(); + void LoopOnce(); // interceptor handle which process message MsgHandle handle_{nullptr}; - // mutex to control read/write conflict for remote mailbox - std::mutex 
remote_mailbox_mutex_; - - // interceptor runs PoolTheMailbox() function to poll local mailbox - std::thread interceptor_thread_; - - // conditional variable for blocking the thread when - // fetch an empty remote mailbox - std::condition_variable cond_var_; - - // remote mailbox, written by EnqueueRemoteMessage() - // read by FetchRemoteMailbox() - std::queue remote_mailbox_; - - // local mailbox, written by FetchRemoteMailbox() - // read by PoolTheMailbox() - std::queue local_mailbox_; + std::mutex mutex_; + std::deque messages_; int64_t already_run_times_{0}; int64_t used_slot_nums_{0}; diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message.proto b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto index c9ab477183a31..ed38894641c3a 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor_message.proto +++ b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto @@ -34,7 +34,8 @@ message InterceptorMessage { message InterceptorResponse { optional bool rst = 1 [ default = false ]; } -service TheInterceptorMessageService { - rpc InterceptorMessageService(InterceptorMessage) +service MessageService { + rpc ReceiveInterceptorMessage(InterceptorMessage) returns (InterceptorResponse); + rpc IncreaseBarrierCount(InterceptorMessage) returns (InterceptorResponse); } diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index d4c986de5a03c..8d2ec5c41d864 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -18,7 +18,7 @@ #include #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -26,16 +26,25 @@ namespace paddle { namespace distributed { void MessageBus::Init( - const std::unordered_map& interceptor_id_to_rank, - const std::unordered_map& rank_to_addr, + int64_t rank, const std::unordered_map& rank_to_addr, const std::string& addr) { PADDLE_ENFORCE_EQ(is_init_, false, platform::errors::AlreadyExists( "MessageBus is already init.")); + rank_ = rank; is_init_ = true; - interceptor_id_to_rank_ = interceptor_id_to_rank; rank_to_addr_ = rank_to_addr; addr_ = addr; + if (addr_ != "") { + const auto& addr = GetAddr(rank_); + PADDLE_ENFORCE_EQ(addr, addr_, + platform::errors::Fatal( + "The current rank's addr is %s, while the " + "message bus's addr is %s, which are different. 
" + "Init error.", + addr, addr_)); + } + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) // NOTE: To make the brpc is compatible with collective, @@ -65,43 +74,100 @@ MessageBus::~MessageBus() { #endif } -bool MessageBus::Send(const InterceptorMessage& interceptor_message) { - // called by Interceptor, send InterceptorMessage to dst - int64_t src_id = interceptor_message.src_id(); - int64_t dst_id = interceptor_message.dst_id(); - if (IsSameRank(src_id, dst_id)) { - VLOG(3) << "Send a message from interceptor " << src_id - << " to interceptor " << dst_id << ", which are in the same ranks."; - return SendIntraRank(interceptor_message); - } else { - VLOG(3) << "Send a message from interceptor " << src_id - << " to interceptor " << dst_id - << ", which are in different ranks."; +const std::string& MessageBus::GetAddr(int64_t rank) const { + PADDLE_ENFORCE_NE( + rank_to_addr_.find(rank), rank_to_addr_.end(), + platform::errors::NotFound("Cannot find addr rank id %lld.", rank)); + return rank_to_addr_.at(rank); +} + +bool MessageBus::Send(int64_t dst_rank, + const InterceptorMessage& interceptor_message) { + PADDLE_ENFORCE_EQ( + IsInit(), true, + platform::errors::PreconditionNotMet( + "Using message bus since it has not been initialized.")); #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) - int retry_time = 0; // message bus will retry sending for 10 times - while (retry_time < 10) { - ++retry_time; - if (SendInterRank(interceptor_message)) { - VLOG(3) << "Message bus sends inter rank successfully with " - << retry_time << " times retries."; - return true; - } - VLOG(3) << "Message bus sends failed, retry after 1 seconds."; - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + int retry_time = 0; // message bus will retry sending for 10 times + while (retry_time < 10) { + ++retry_time; + if (SendInterRank(dst_rank, interceptor_message)) { + VLOG(3) << "Message bus sends inter rank successfully with " << retry_time + << " times retries."; + return true; } - VLOG(3) << "Message bus sends inter rank fail after 10 times retries."; - return false; + VLOG(3) << "Message bus sends failed, retry after 1 seconds."; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + VLOG(3) << "Message bus sends inter rank fail after 10 times retries."; + return false; #else - PADDLE_THROW(platform::errors::Unavailable( - "Fleet executor does not support sending message between different " - "ranks when Paddle is compiled with npu or " - "isn't compiled with distributed for now.")); + PADDLE_THROW(platform::errors::Unavailable( + "Fleet executor does not support sending message between different " + "ranks when Paddle is compiled with npu or " + "isn't compiled with distributed for now.")); #endif - } return true; } +void MessageBus::IncreaseBarrierCount() { + VLOG(3) << "IncreaseBarrierCount"; + { + std::unique_lock lock(mutex_); + ++count_; + cv_.notify_one(); + } + VLOG(3) << "End IncreaseBarrierCount"; +} + +void MessageBus::Barrier() { + // gather to root + if (rank_ != 0) { + InterceptorMessage ctrl_msg; + ctrl_msg.set_ctrl_message(true); + ctrl_msg.set_src_id(rank_); + ctrl_msg.set_dst_id(0); + VLOG(3) << "Barrier Gather ctrl message from " << rank_ << " to 0"; + while (!Send(0, ctrl_msg)) { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } else { + VLOG(3) << "Barrier 0 wait others rank ready"; + std::unique_lock 
lock(mutex_); + cv_.wait(lock, [this] { + return count_ == static_cast(rank_to_addr_.size() - 1); + }); + count_ = 0; + } + + // scatter from root + if (rank_ == 0) { + for (int i = 1; i < static_cast(rank_to_addr_.size()); ++i) { + InterceptorMessage ctrl_msg; + ctrl_msg.set_ctrl_message(true); + ctrl_msg.set_src_id(0); + ctrl_msg.set_dst_id(i); + VLOG(3) << "Barrier Scatter ctrl message from 0 to " << i; + while (!Send(i, ctrl_msg)) { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + } else { + VLOG(3) << "Barrier " << rank_ << " wait others rank ready"; + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { return count_ == 1; }); + count_ = 0; + } +} + +bool MessageBus::DispatchMsgToCarrier( + const InterceptorMessage& interceptor_message) { + const std::string& carrier_id = *GlobalVal::Get(); + return GlobalMap::Get(carrier_id) + ->EnqueueInterceptorMessage(interceptor_message); +} + void MessageBus::ListenPort() { if (addr_ == "") { LOG(INFO) << "No need listen to port since training on single card."; @@ -110,10 +176,9 @@ void MessageBus::ListenPort() { #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) // function keep listen the port and handle the message - PADDLE_ENFORCE_EQ(server_.AddService(&interceptor_message_service_, - brpc::SERVER_DOESNT_OWN_SERVICE), - 0, platform::errors::Unavailable( - "Message bus: init brpc service error.")); + PADDLE_ENFORCE_EQ( + server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0, + platform::errors::Unavailable("Message bus: init brpc service error.")); // start the server const char* ip_for_brpc = addr_.c_str(); @@ -130,30 +195,6 @@ void MessageBus::ListenPort() { interval += 500; } LOG(INFO) << "Message bus's listen port thread starts successful."; - - std::set visit; - InterceptorMessage tmp_msg; - tmp_msg.set_ctrl_message(true); - for (auto pair : interceptor_id_to_rank_) { - if (rank_to_addr_.at(pair.second) == addr_) { - tmp_msg.set_src_id(pair.first); - } - } - for (auto pair : interceptor_id_to_rank_) { - int64_t rank = pair.second; - if (rank_to_addr_.at(rank) == addr_) { - continue; - } - tmp_msg.set_dst_id(pair.first); - if (visit.find(rank) == visit.end()) { - VLOG(3) << "Message bus is testing connection for rank: " << rank << "."; - visit.insert(rank); - while (!Send(tmp_msg)) { - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } - VLOG(3) << "Message bus has connected to rank: " << rank << "."; - } - } #else LOG(WARNING) << "Fleet executor's ListenPort() is a fake function when Paddle is " @@ -162,53 +203,13 @@ void MessageBus::ListenPort() { #endif } -bool MessageBus::IsSameRank(int64_t src_id, int64_t dst_id) { - // -1 is sent by carrier to source interceptor - if (src_id == -1) src_id = dst_id; - - // check whether the dst is the same rank or different rank with src - const auto& src_rank = interceptor_id_to_rank_.find(src_id); - const auto& dst_rank = interceptor_id_to_rank_.find(dst_id); - PADDLE_ENFORCE_NE( - src_rank, interceptor_id_to_rank_.end(), - platform::errors::NotFound( - "Cannot find rank for src interceptor id %lld. Init error.", src_id)); - PADDLE_ENFORCE_NE( - dst_rank, interceptor_id_to_rank_.end(), - platform::errors::NotFound( - "Cannot find rank for dst interceptor id %lld. 
Init error.", dst_id)); - if (addr_ == "") { - // single card training, must be same rank - return true; - } - const auto& src_ip = rank_to_addr_.find(src_rank->second); - PADDLE_ENFORCE_NE(src_ip, rank_to_addr_.end(), - platform::errors::NotFound( - "Cannot find addr for src rank id %lld. Init error.", - src_rank->second)); - PADDLE_ENFORCE_EQ( - src_ip->second, addr_, - platform::errors::Fatal("The src interceptor's addr is %s, while the " - "message bus's addr is %s, which are different. " - "Init error.", - src_ip->second, addr_)); - return src_rank->second == dst_rank->second; -} - #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) -bool MessageBus::SendInterRank(const InterceptorMessage& interceptor_message) { - // send the message inter rank (dst is different rank with src) - int64_t dst_id = interceptor_message.dst_id(); - int64_t dst_rank = interceptor_id_to_rank_[dst_id]; - auto dst_ip = rank_to_addr_.find(dst_rank); - PADDLE_ENFORCE_NE(dst_ip, rank_to_addr_.end(), - platform::errors::InvalidArgument( - "Cannot find rank for dst interceptor id %lld. " - "Init error.", - dst_id)); - VLOG(3) << "Message bus sending to addr: " << dst_ip->second; - const char* dst_ip_for_brpc = dst_ip->second.c_str(); +bool MessageBus::SendInterRank(int64_t dst_rank, + const InterceptorMessage& interceptor_message) { + const auto& dst_addr = GetAddr(dst_rank); + VLOG(3) << "Message bus sending to addr: " << dst_addr; + const char* dst_addr_for_brpc = dst_addr.c_str(); brpc::Channel channel; brpc::ChannelOptions options; options.protocol = "baidu_std"; @@ -216,13 +217,18 @@ bool MessageBus::SendInterRank(const InterceptorMessage& interceptor_message) { options.timeout_ms = 1000; options.max_retry = 5; PADDLE_ENFORCE_EQ( - channel.Init(dst_ip_for_brpc, &options), 0, + channel.Init(dst_addr_for_brpc, &options), 0, platform::errors::Unavailable("Message bus: init brpc channel error.")); - TheInterceptorMessageService_Stub stub(&channel); + MessageService_Stub stub(&channel); InterceptorResponse response; brpc::Controller ctrl; ctrl.set_log_id(0); - stub.InterceptorMessageService(&ctrl, &interceptor_message, &response, NULL); + if (interceptor_message.ctrl_message()) { + stub.IncreaseBarrierCount(&ctrl, &interceptor_message, &response, NULL); + } else { + stub.ReceiveInterceptorMessage(&ctrl, &interceptor_message, &response, + NULL); + } if (!ctrl.Failed()) { if (response.rst()) { VLOG(3) << "Message bus: brpc sends success."; @@ -237,13 +243,8 @@ bool MessageBus::SendInterRank(const InterceptorMessage& interceptor_message) { return false; } } -#endif -bool MessageBus::SendIntraRank(const InterceptorMessage& interceptor_message) { - // send the message intra rank (dst is the same rank with src) - return FleetExecutor::GetCarrier().EnqueueInterceptorMessage( - interceptor_message); -} +#endif } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h index 3f151cab3a46c..d805ac81606b8 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.h +++ b/paddle/fluid/distributed/fleet_executor/message_bus.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -23,7 +24,7 @@ !defined(PADDLE_WITH_ASCEND_CL) #include "brpc/channel.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" +#include "paddle/fluid/distributed/fleet_executor/message_service.h" #endif #include 
"paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h" @@ -42,14 +43,18 @@ class MessageBus final { MessageBus() = default; ~MessageBus(); - void Init(const std::unordered_map& interceptor_id_to_rank, + void Init(int64_t rank, const std::unordered_map& rank_to_addr, const std::string& addr); bool IsInit() const; // called by Interceptor, send InterceptorMessage to dst - bool Send(const InterceptorMessage& interceptor_message); + bool Send(int64_t dst_rank, const InterceptorMessage& interceptor_message); + + void IncreaseBarrierCount(); + void Barrier(); + bool DispatchMsgToCarrier(const InterceptorMessage& interceptor_message); private: DISABLE_COPY_AND_ASSIGN(MessageBus); @@ -57,22 +62,18 @@ class MessageBus final { // function keep listen the port and handle the message void ListenPort(); - // check whether the dst is the same rank or different rank with src - bool IsSameRank(int64_t src_id, int64_t dst_id); + const std::string& GetAddr(int64_t rank) const; #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) // send the message inter rank (dst is different rank with src) - bool SendInterRank(const InterceptorMessage& interceptor_message); + bool SendInterRank(int64_t dst_rank, + const InterceptorMessage& interceptor_message); #endif bool is_init_{false}; - // send the message intra rank (dst is the same rank with src) - bool SendIntraRank(const InterceptorMessage& interceptor_message); - - // handed by above layer, save the info mapping interceptor id to rank id - std::unordered_map interceptor_id_to_rank_; + int64_t rank_; // handed by above layer, save the info mapping rank id to addr std::unordered_map rank_to_addr_; @@ -82,10 +83,15 @@ class MessageBus final { #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) - InterceptorMessageServiceImpl interceptor_message_service_; + MessageServiceImpl message_service_; // brpc server brpc::Server server_; #endif + + // for barrier + std::mutex mutex_; + std::condition_variable cv_; + int count_{0}; }; } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc similarity index 59% rename from paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc rename to paddle/fluid/distributed/fleet_executor/message_service.cc index a8d29758ca163..c3fff98f684ad 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -13,23 +13,34 @@ // limitations under the License. 
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) -#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" +#include "paddle/fluid/distributed/fleet_executor/message_service.h" #include "brpc/server.h" -#include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" namespace paddle { namespace distributed { -void InterceptorMessageServiceImpl::InterceptorMessageService( +void MessageServiceImpl::ReceiveInterceptorMessage( google::protobuf::RpcController* control_base, const InterceptorMessage* request, InterceptorResponse* response, google::protobuf::Closure* done) { brpc::ClosureGuard done_guard(done); - VLOG(3) << "Interceptor Message Service receives a message from interceptor " + VLOG(3) << "Message Service receives a message from interceptor " << request->src_id() << " to interceptor " << request->dst_id() << ", with the message: " << request->message_type(); - FleetExecutor::GetCarrier().EnqueueInterceptorMessage(*request); + bool flag = GlobalVal::Get()->DispatchMsgToCarrier(*request); + response->set_rst(flag); +} + +void MessageServiceImpl::IncreaseBarrierCount( + google::protobuf::RpcController* control_base, + const InterceptorMessage* request, InterceptorResponse* response, + google::protobuf::Closure* done) { + brpc::ClosureGuard done_guard(done); + VLOG(3) << "Barrier Service receives a message from rank " + << request->src_id() << " to rank " << request->dst_id(); + GlobalVal::Get()->IncreaseBarrierCount(); response->set_rst(true); } diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h similarity index 75% rename from paddle/fluid/distributed/fleet_executor/interceptor_message_service.h rename to paddle/fluid/distributed/fleet_executor/message_service.h index 0a8dfc861a910..02f73471e3b91 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor_message_service.h +++ b/paddle/fluid/distributed/fleet_executor/message_service.h @@ -21,11 +21,15 @@ namespace paddle { namespace distributed { -class InterceptorMessageServiceImpl : public TheInterceptorMessageService { +class MessageServiceImpl : public MessageService { public: - InterceptorMessageServiceImpl() {} - virtual ~InterceptorMessageServiceImpl() {} - virtual void InterceptorMessageService( + MessageServiceImpl() {} + virtual ~MessageServiceImpl() {} + virtual void ReceiveInterceptorMessage( + google::protobuf::RpcController* control_base, + const InterceptorMessage* request, InterceptorResponse* response, + google::protobuf::Closure* done); + virtual void IncreaseBarrierCount( google::protobuf::RpcController* control_base, const InterceptorMessage* request, InterceptorResponse* response, google::protobuf::Closure* done); diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc index 1ad144470af26..614b4c37e8254 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc @@ -21,7 +21,7 @@ namespace distributed { std::string RuntimeGraph::DebugString() const { std::ostringstream os; os << "\nRuntime Graph Debug: \n"; - for (const auto& pair : intercepter_id_to_node_) { + for (const auto& pair : 
interceptor_id_to_node_) { os << pair.second->DebugString(); os << "\n"; } diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.h b/paddle/fluid/distributed/fleet_executor/runtime_graph.h index 3678e2e860a9d..1ca9f0174ed07 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.h +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.h @@ -29,26 +29,26 @@ class RuntimeGraph final { public: RuntimeGraph() = default; ~RuntimeGraph() = default; - const std::unordered_map& intercepter_id_to_node() const { - return intercepter_id_to_node_; + const std::unordered_map& interceptor_id_to_node() const { + return interceptor_id_to_node_; } - const std::unordered_map& intercepter_id_to_rank() const { - return intercepter_id_to_rank_; + const std::unordered_map& interceptor_id_to_rank() const { + return interceptor_id_to_rank_; } void SetInterceptorIdToRank( - const std::unordered_map& intercepter_id_to_rank) { - intercepter_id_to_rank_ = intercepter_id_to_rank; + const std::unordered_map& interceptor_id_to_rank) { + interceptor_id_to_rank_ = interceptor_id_to_rank; } void SetInterceptorIdToNode( - const std::unordered_map& intercepter_id_to_node) { - intercepter_id_to_node_ = intercepter_id_to_node; + const std::unordered_map& interceptor_id_to_node) { + interceptor_id_to_node_ = interceptor_id_to_node; } std::string DebugString() const; private: DISABLE_COPY_AND_ASSIGN(RuntimeGraph); - std::unordered_map intercepter_id_to_node_; - std::unordered_map intercepter_id_to_rank_; + std::unordered_map interceptor_id_to_node_; + std::unordered_map interceptor_id_to_rank_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/task_loop.cc b/paddle/fluid/distributed/fleet_executor/task_loop.cc new file mode 100644 index 0000000000000..bfe9a939b966c --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/task_loop.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
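
NOTE: the new task_loop.{h,cc}, task_loop_thread.{h,cc} and task_loop_thread_pool.{h,cc} files below add a per-thread event loop (one TaskLoop per thread, draining tasks from a framework::BlockingQueue) plus a thread wrapper and a pool. The usage sketch below is written against the interfaces declared in those headers; RunOnPool and the task bodies are illustrative, and shutdown is left to the pool's destructor, which quits each loop and joins its thread.

// Usage sketch for the TaskLoop / TaskLoopThreadPool added in this patch.
#include "paddle/fluid/distributed/fleet_executor/task_loop.h"
#include "paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.h"

void RunOnPool() {
  paddle::distributed::TaskLoopThreadPool pool;
  pool.SetThreadNum(2);
  pool.Start();  // spawns 2 threads, each running TaskLoop::Loop()

  // Fire-and-forget task on loop 0.
  pool.GetLoop(0)->QueueInLoop([] { /* work executed on loop thread 0 */ });

  // Task with a result on loop 1; Enqueue returns a std::future.
  auto fut = pool.GetLoop(1)->Enqueue([](int x) { return x * x; }, 7);
  int squared = fut.get();  // 49
  (void)squared;
}  // pool destruction quits both loops and joins the threads
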
+ +#include "paddle/fluid/distributed/fleet_executor/task_loop.h" + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/errors.h" + +namespace paddle { +namespace distributed { + +thread_local TaskLoop* TaskLoop::thread_local_loop_ = nullptr; + +TaskLoop* TaskLoop::GetTaskLoopOfCurrentThread() { return thread_local_loop_; } + +TaskLoop::TaskLoop() + : looping_(false), quit_(false), thread_id_(std::this_thread::get_id()) { + PADDLE_ENFORCE_EQ( + thread_local_loop_, nullptr, + platform::errors::AlreadyExists("Another TaskLoop is already init.")); + thread_local_loop_ = this; +} + +TaskLoop::~TaskLoop() { thread_local_loop_ = nullptr; } + +void TaskLoop::Loop() { + PADDLE_ENFORCE_EQ(looping_, false, + platform::errors::PreconditionNotMet( + "Loop can only execute in one loop thread")); + AssertInLoopThread(); + + looping_ = true; + quit_ = false; + + while (!quit_) { + auto tasks = tasks_.PopAll(); + for (auto& task : tasks) { + task(); + } + } + looping_ = false; +} + +void TaskLoop::Quit() { + quit_ = true; + if (!IsInLoopThread()) WakeUp(); +} + +void TaskLoop::RunInLoop(Functor cb) { + if (IsInLoopThread()) { + cb(); + } else { + QueueInLoop(cb); + } +} + +void TaskLoop::QueueInLoop(Functor cb) { tasks_.Push(cb); } + +void TaskLoop::WakeUp() { + Functor task([] {}); + QueueInLoop(task); +} + +void TaskLoop::AbortNotInLoopThread() { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "This TaskLoop was created in thread %d, but current thread is %d", + thread_id_, std::this_thread::get_id())); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/task_loop.h b/paddle/fluid/distributed/fleet_executor/task_loop.h new file mode 100644 index 0000000000000..ddf8d292d4078 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/task_loop.h @@ -0,0 +1,84 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace distributed { + +class TaskLoop { + public: + static TaskLoop* GetTaskLoopOfCurrentThread(); + + using Functor = std::function; + + TaskLoop(); + ~TaskLoop(); + + void Loop(); + void Quit(); + + void RunInLoop(Functor cb); + void QueueInLoop(Functor cb); + + template + auto Enqueue(F&& f, Args&&... 
args) + -> std::future::type> { + using return_type = typename std::result_of::type; + + auto task = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); + std::future task_future = task->get_future(); + + tasks_.Push([task]() { (*task)(); }); + return task_future; + } + + void WakeUp(); + + bool IsInLoopThread() const { + return thread_id_ == std::this_thread::get_id(); + } + + void AssertInLoopThread() { + if (!IsInLoopThread()) { + AbortNotInLoopThread(); + } + } + + private: + DISABLE_COPY_AND_ASSIGN(TaskLoop); + + void AbortNotInLoopThread(); + + static thread_local TaskLoop* thread_local_loop_; + + bool looping_; + std::atomic quit_; + std::thread::id thread_id_; + + framework::BlockingQueue tasks_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc new file mode 100644 index 0000000000000..bb313ad37890d --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/fleet_executor/task_loop_thread.h" + +#include "paddle/fluid/distributed/fleet_executor/task_loop.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/errors.h" + +namespace paddle { +namespace distributed { + +TaskLoopThread::TaskLoopThread() : start_(false), loop_(nullptr) {} + +TaskLoopThread::~TaskLoopThread() { + if (loop_ != nullptr) { + loop_->Quit(); + thread_.join(); + } +} + +TaskLoop* TaskLoopThread::StartLoop() { + PADDLE_ENFORCE_EQ(start_, false, platform::errors::PreconditionNotMet( + "thread is already running.")); + start_ = true; + thread_ = std::thread([this]() { Loop(); }); + + std::unique_lock lock(mutex_); + cv_.wait(lock, [=] { return loop_ != nullptr; }); + return loop_; +} + +void TaskLoopThread::Loop() { + TaskLoop loop; + { + std::unique_lock lock(mutex_); + loop_ = &loop; + cv_.notify_one(); + } + loop.Loop(); + + std::unique_lock lock(mutex_); + loop_ = nullptr; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread.h b/paddle/fluid/distributed/fleet_executor/task_loop_thread.h new file mode 100644 index 0000000000000..ad5e99a5dec17 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread.h @@ -0,0 +1,48 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace distributed { + +class TaskLoop; + +class TaskLoopThread { + public: + TaskLoopThread(); + ~TaskLoopThread(); + + TaskLoop* StartLoop(); + + private: + DISABLE_COPY_AND_ASSIGN(TaskLoopThread); + + void Loop(); + + bool start_; + TaskLoop* loop_; + std::thread thread_; + std::mutex mutex_; + std::condition_variable cv_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc new file mode 100644 index 0000000000000..ed34bbb87fc6b --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.h" + +#include "paddle/fluid/distributed/fleet_executor/task_loop.h" +#include "paddle/fluid/distributed/fleet_executor/task_loop_thread.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/errors.h" + +namespace paddle { +namespace distributed { + +TaskLoopThreadPool::TaskLoopThreadPool() : TaskLoopThreadPool(1) {} + +TaskLoopThreadPool::TaskLoopThreadPool(int thread_num) + : start_(false), thread_num_(thread_num) {} + +TaskLoopThreadPool::~TaskLoopThreadPool() = default; + +void TaskLoopThreadPool::Start() { + PADDLE_ENFORCE_EQ(start_, false, platform::errors::PreconditionNotMet( + "thread pool is already start.")); + PADDLE_ENFORCE_GT( + thread_num_, 0, + platform::errors::InvalidArgument( + "thread num must greater than 0, but now is %d", thread_num_)); + + start_ = true; + for (int i = 0; i < thread_num_; ++i) { + threads_.emplace_back(new TaskLoopThread()); + loops_.push_back(threads_[i]->StartLoop()); + } +} + +TaskLoop* TaskLoopThreadPool::GetLoop(int tid) { + PADDLE_ENFORCE_EQ(start_, true, platform::errors::PreconditionNotMet( + "thread pool must start first.")); + PADDLE_ENFORCE_GE(tid, 0, platform::errors::OutOfRange( + "tid must >= 0, but now is %d", tid)); + PADDLE_ENFORCE_LT(tid, thread_num_, + platform::errors::OutOfRange( + "tid must < thread_num, but now tid=%d thread_num=%d", + tid, thread_num_)); + return loops_[tid]; +} + +std::vector TaskLoopThreadPool::GetAllLoops() { + PADDLE_ENFORCE_EQ(start_, true, platform::errors::PreconditionNotMet( + "thread pool must start first.")); + return loops_; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.h b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.h new file mode 100644 index 0000000000000..559a83ef5a86a --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.h @@ -0,0 +1,51 @@ +// 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace distributed { + +class TaskLoop; +class TaskLoopThread; + +class TaskLoopThreadPool { + public: + TaskLoopThreadPool(); + explicit TaskLoopThreadPool(int thread_num); + ~TaskLoopThreadPool(); + + void SetThreadNum(int thread_num) { thread_num_ = thread_num; } + + void Start(); + + TaskLoop* GetLoop(int tid); + std::vector GetAllLoops(); + + private: + DISABLE_COPY_AND_ASSIGN(TaskLoopThreadPool); + + bool start_; + int thread_num_; + std::vector> threads_; + std::vector loops_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index e56696d35f2a4..07d2a0f6b727a 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -62,12 +62,13 @@ TEST(ComputeInterceptor, Compute) { std::vector scopes = {scope, scope}; platform::Place place = platform::CPUPlace(); - // TODO(liyurui): Remove singleton when move SendIntra into Carrier - Carrier& carrier = FleetExecutor::GetCarrier(); + std::string carrier_id = "0"; + Carrier* carrier = + GlobalMap::Create(carrier_id, carrier_id); + carrier->Init(0, {{0, 0}, {1, 0}}); - auto msg_bus = std::make_shared(); - msg_bus->Init({{0, 0}, {1, 0}}, {{0, "127.0.0.0:0"}}, ""); - carrier.SetMsgBus(msg_bus); + MessageBus* msg_bus = GlobalVal::Create(); + msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); // FIXME: don't delete, otherwise interceptor will use undefined node TaskNode* node_a = @@ -78,24 +79,22 @@ TEST(ComputeInterceptor, Compute) { node_a->AddDownstreamTask(1); node_b->AddUpstreamTask(0); - auto* a = carrier.SetInterceptor( + auto* a = carrier->SetInterceptor( 0, InterceptorFactory::Create("Compute", 0, node_a)); - carrier.SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); + carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); a->SetPlace(place); a->SetMicroBatchScope(scopes); - carrier.SetCreatingFlag(false); - // start InterceptorMessage msg; msg.set_message_type(DATA_IS_READY); msg.set_src_id(-1); msg.set_dst_id(0); - carrier.EnqueueInterceptorMessage(msg); + carrier->EnqueueInterceptorMessage(msg); - carrier.Wait(); - 
carrier.Release(); + carrier->Wait(); + carrier->Release(); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc index 3bd2ddec4effc..954b52693f46c 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -47,12 +47,13 @@ class StartInterceptor : public Interceptor { }; TEST(ComputeInterceptor, Compute) { - // TODO(liyurui): Remove singleton when move SendIntra into Carrier - Carrier& carrier = FleetExecutor::GetCarrier(); + std::string carrier_id = "0"; + Carrier* carrier = + GlobalMap::Create(carrier_id, carrier_id); + carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}}); - auto msg_bus = std::make_shared(); - msg_bus->Init({{0, 0}, {1, 0}, {2, 0}}, {{0, "127.0.0.0:0"}}, ""); - carrier.SetMsgBus(msg_bus); + MessageBus* msg_bus = GlobalVal::Create(); + msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); // NOTE: don't delete, otherwise interceptor will use undefined node TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id @@ -66,11 +67,9 @@ TEST(ComputeInterceptor, Compute) { node_c->AddUpstreamTask(1); Interceptor* a = - carrier.SetInterceptor(0, std::make_unique(0, node_a)); - carrier.SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); - carrier.SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); - - carrier.SetCreatingFlag(false); + carrier->SetInterceptor(0, std::make_unique(0, node_a)); + carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); + carrier->SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); InterceptorMessage msg; msg.set_message_type(DATA_IS_READY); @@ -79,8 +78,8 @@ TEST(ComputeInterceptor, Compute) { a->Send(1, msg); a->Send(1, msg); - carrier.Wait(); - carrier.Release(); + carrier->Wait(); + carrier->Release(); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc index 8d9e609a24034..19c1d0a0d7a6a 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -60,23 +60,22 @@ class PingPongInterceptor : public Interceptor { REGISTER_INTERCEPTOR(PingPong, PingPongInterceptor); TEST(InterceptorTest, PingPong) { - // TODO(liyurui): Remove singleton when move SendIntra into Carrier - Carrier& carrier = FleetExecutor::GetCarrier(); - - auto msg_bus = std::make_shared(); - msg_bus->Init({{0, 0}, {1, 0}}, {{0, "127.0.0.0:0"}}, ""); - carrier.SetMsgBus(msg_bus); - - Interceptor* a = carrier.SetInterceptor( + std::string carrier_id = "0"; + Carrier* carrier = + GlobalMap::Create(carrier_id, carrier_id); + carrier->Init(0, {{0, 0}, {1, 0}}); + MessageBus* msg_bus = GlobalVal::Create(); + msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); + + Interceptor* a = carrier->SetInterceptor( 0, InterceptorFactory::Create("PingPong", 0, nullptr)); - carrier.SetInterceptor(1, std::make_unique(1, nullptr)); - carrier.SetCreatingFlag(false); + carrier->SetInterceptor(1, std::make_unique(1, nullptr)); InterceptorMessage msg; a->Send(1, msg); - carrier.Wait(); + carrier->Wait(); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc index 93574609960a1..78cff2606f6b8 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -104,35 +104,35 @@ TEST(InterceptorTest, PingPong) { std::string ip1 = "127.0.0.1:" + std::to_string(port1); std::cout << "ip0: " << ip0 << std::endl; std::cout << "ip1: " << ip1 << std::endl; + std::unordered_map interceptor_id_to_rank = {{0, 0}, + {1, 1}}; + std::string carrier_id = "0"; int pid = fork(); if (pid == 0) { - auto msg_bus = std::make_shared(); - msg_bus->Init({{0, 0}, {1, 1}}, {{0, ip0}, {1, ip1}}, ip0); - - // TODO(liyurui): Remove singleton when move SendIntra into Carrier - Carrier& carrier = FleetExecutor::GetCarrier(); - carrier.SetMsgBus(msg_bus); - - Interceptor* a = carrier.SetInterceptor( + Carrier* carrier = + GlobalMap::Create(carrier_id, carrier_id); + GlobalVal::Set(new std::string(carrier_id)); + MessageBus* msg_bus = GlobalVal::Create(); + msg_bus->Init(0, {{0, ip0}, {1, ip1}}, ip0); + carrier->Init(0, interceptor_id_to_rank); + Interceptor* a = carrier->SetInterceptor( 0, InterceptorFactory::Create("PingPong", 0, nullptr)); - carrier.SetCreatingFlag(false); - + msg_bus->Barrier(); InterceptorMessage msg; a->Send(1, msg); - carrier.Wait(); + carrier->Wait(); } else { - auto msg_bus = std::make_shared(); - msg_bus->Init({{0, 0}, {1, 1}}, {{0, ip0}, {1, ip1}}, ip1); - - // TODO(liyurui): Remove singleton when move SendIntra into Carrier - Carrier& carrier = FleetExecutor::GetCarrier(); - carrier.SetMsgBus(msg_bus); - - carrier.SetInterceptor(1, - InterceptorFactory::Create("PingPong", 1, nullptr)); - carrier.SetCreatingFlag(false); - carrier.Wait(); + Carrier* carrier = + GlobalMap::Create(carrier_id, carrier_id); + GlobalVal::Set(new std::string(carrier_id)); + MessageBus* msg_bus = GlobalVal::Create(); + msg_bus->Init(1, {{0, ip0}, {1, ip1}}, ip1); + carrier->Init(1, interceptor_id_to_rank); + carrier->SetInterceptor(1, + InterceptorFactory::Create("PingPong", 1, nullptr)); + msg_bus->Barrier(); + carrier->Wait(); } } diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc index cf66725a88f80..3860e9f4e137e 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -52,12 +52,12 @@ void LinkNodes(const std::vector& nodes) { } TEST(AmplifierInterceptor, Amplifier) { - // TODO(liyurui): Remove singleton when move SendIntra into Carrier - Carrier& carrier = FleetExecutor::GetCarrier(); - auto msg_bus = std::make_shared(); - msg_bus->Init({{0, 0}, {1, 0}, {2, 0}, {3, 0}, {4, 0}, {5, 0}}, - {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); - carrier.SetMsgBus(msg_bus); + std::string carrier_id = "0"; + Carrier* carrier = + GlobalMap::Create(carrier_id, carrier_id); + carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}, {3, 0}, {4, 0}, {5, 0}}); + MessageBus* msg_bus = GlobalVal::Create(); + msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); int64_t micro_steps = 3; @@ -76,23 +76,23 @@ TEST(AmplifierInterceptor, Amplifier) { node_b->SetReplyUpPerSteps(micro_steps); node_e->SetSendDownPerSteps(micro_steps); - carrier.SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a)); - carrier.SetInterceptor(1, InterceptorFactory::Create("Amplifier", 1, node_b)); - carrier.SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); - carrier.SetInterceptor(3, InterceptorFactory::Create("Compute", 3, node_d)); - carrier.SetInterceptor(4, InterceptorFactory::Create("Amplifier", 4, node_e)); - carrier.SetInterceptor(5, InterceptorFactory::Create("Compute", 5, node_f)); - - carrier.SetCreatingFlag(false); + carrier->SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a)); + carrier->SetInterceptor(1, + InterceptorFactory::Create("Amplifier", 1, node_b)); + carrier->SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); + carrier->SetInterceptor(3, InterceptorFactory::Create("Compute", 3, node_d)); + carrier->SetInterceptor(4, + InterceptorFactory::Create("Amplifier", 4, node_e)); + carrier->SetInterceptor(5, InterceptorFactory::Create("Compute", 5, node_f)); // start InterceptorMessage msg; msg.set_message_type(DATA_IS_READY); msg.set_src_id(-1); msg.set_dst_id(0); - carrier.EnqueueInterceptorMessage(msg); - carrier.Wait(); - carrier.Release(); + carrier->EnqueueInterceptorMessage(msg); + carrier->Wait(); + carrier->Release(); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc index e2ca934b5b02f..b510b68e4e2ed 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/fleet_executor/carrier.h" -#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -70,11 +70,12 @@ void LinkNodes(const std::vector& nodes, } TEST(AmplifierInterceptor, Amplifier) { - // TODO(liyurui): Remove singleton when move SendIntra into Carrier - Carrier& carrier = FleetExecutor::GetCarrier(); - auto msg_bus = std::make_shared(); - msg_bus->Init({{0, 0}, {1, 0}, {2, 0}, {3, 0}}, {{0, ""}}, ""); - carrier.SetMsgBus(msg_bus); + std::string carrier_id = "0"; + Carrier* carrier = + GlobalMap::Create(carrier_id, carrier_id); + carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}, {3, 0}}); + MessageBus* msg_bus = GlobalVal::Create(); + msg_bus->Init(0, {{0, ""}}, ""); int64_t micro_steps = 6; @@ -93,21 +94,21 @@ TEST(AmplifierInterceptor, Amplifier) { node_d->SetRunPerSteps(micro_steps); node_d->SetRunAtOffset(micro_steps - 1); - carrier.SetInterceptor(0, InterceptorFactory::Create("Amplifier", 0, node_a)); - carrier.SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); - carrier.SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); - carrier.SetInterceptor(3, InterceptorFactory::Create("Amplifier", 3, node_d)); - - carrier.SetCreatingFlag(false); + carrier->SetInterceptor(0, + InterceptorFactory::Create("Amplifier", 0, node_a)); + carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); + carrier->SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); + carrier->SetInterceptor(3, + InterceptorFactory::Create("Amplifier", 3, node_d)); // start InterceptorMessage msg; msg.set_message_type(DATA_IS_READY); msg.set_src_id(-1); msg.set_dst_id(0); - carrier.EnqueueInterceptorMessage(msg); - carrier.Wait(); - carrier.Release(); + carrier->EnqueueInterceptorMessage(msg); + carrier->Wait(); + carrier->Release(); } } // namespace distributed diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 92dcde99cccb0..6eb8462977b60 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -103,19 +103,17 @@ void SerializeLodTensor(framework::Variable* var, if (platform::is_cpu_place(tensor->place())) { auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); - iobuf->append(reinterpret_cast(tensor->data()), - data_len); + iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA char* temp_ptr = new char[tensor->numel() * framework::SizeOfType(tensor->type())]; auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - tensor->data(), - tensor->numel() * framework::SizeOfType(tensor->type()), - stream); + memory::Copy( + platform::CPUPlace(), temp_ptr, + BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + tensor->numel() * framework::SizeOfType(tensor->type()), stream); auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); iobuf->append(reinterpret_cast(temp_ptr), data_len); @@ -147,19 +145,17 @@ void 
SerializeSelectedRows(framework::Variable* var, if (platform::is_cpu_place(tensor->place())) { auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); - iobuf->append(reinterpret_cast(tensor->data()), - data_len); + iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA char* temp_ptr = new char[tensor->numel() * framework::SizeOfType(tensor->type())]; auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - tensor->data(), - tensor->numel() * framework::SizeOfType(tensor->type()), - stream); + memory::Copy( + platform::CPUPlace(), temp_ptr, + BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + tensor->numel() * framework::SizeOfType(tensor->type()), stream); auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); iobuf->append(reinterpret_cast(&data_len), 8); iobuf->append(reinterpret_cast(temp_ptr), data_len); diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h index 8714918dc8ecb..9ea44310f3ecd 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/service/communicator.h @@ -300,6 +300,10 @@ class Communicator { virtual void BarrierWithTable(uint32_t barrier_type) { auto rets = _worker_ptr->barrier(barrier_table_id_, barrier_type); rets.wait(); + int status = rets.get(); + PADDLE_ENFORCE_EQ(status, 0, + platform::errors::InvalidArgument( + "The ret status must be 0 when barrier with table")); } virtual void CreateC2CConnection(int pserver_timeout_ms, diff --git a/paddle/fluid/distributed/service/heter_client.cc b/paddle/fluid/distributed/service/heter_client.cc index d9ec6b21fd377..13016d60515dd 100644 --- a/paddle/fluid/distributed/service/heter_client.cc +++ b/paddle/fluid/distributed/service/heter_client.cc @@ -34,7 +34,7 @@ int GetMicroId(const platform::DeviceContext& ctx, auto micro_id = -1; auto* tensor = var->GetMutable(); if (platform::is_cpu_place(tensor->place())) { - auto data = reinterpret_cast(tensor->data()); + auto data = reinterpret_cast(tensor->data()); micro_id = static_cast(data[0]); } else { #ifdef PADDLE_WITH_CUDA @@ -43,11 +43,10 @@ int GetMicroId(const platform::DeviceContext& ctx, char* temp_ptr = temp.data(); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(platform::CPUPlace(), temp_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - tensor->data(), - tensor->numel() * framework::SizeOfType(tensor->type()), - stream); + memory::Copy( + platform::CPUPlace(), temp_ptr, + BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), + tensor->numel() * framework::SizeOfType(tensor->type()), stream); float* temp_ptr_float = reinterpret_cast(temp_ptr); micro_id = static_cast(temp_ptr_float[0]); #endif diff --git a/paddle/fluid/distributed/service/heter_server.h b/paddle/fluid/distributed/service/heter_server.h index 5f062755c9242..201074810cf31 100644 --- a/paddle/fluid/distributed/service/heter_server.h +++ b/paddle/fluid/distributed/service/heter_server.h @@ -240,7 +240,7 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler { platform::errors::InvalidArgument( "Not find variable microbatch_id in scope.")); auto* tensor = var->GetMutable(); - auto data = reinterpret_cast(tensor->data()); + auto data = reinterpret_cast(tensor->data()); auto micro_id = static_cast(data[0]); int minibatch_index 
= micro_id / 10; diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index e124160e712e0..143b24cf32647 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -220,33 +220,6 @@ int32_t CommonSparseTable::initialize_value() { shard_values_.emplace_back(shard); } - auto accessor = _config.accessor(); - std::vector feasigns; - - for (size_t x = 0; x < accessor.fea_dim(); ++x) { - if (x % _shard_num == _shard_idx) { - feasigns.push_back(x); - } - } - - VLOG(3) << "has " << feasigns.size() << " ids need to be pre inited"; - - auto buckets = bucket(feasigns.size(), 10); - for (int x = 0; x < 10; ++x) { - auto bucket_feasigns = buckets[x + 1] - buckets[x]; - std::vector ids(bucket_feasigns); - std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1], - ids.begin()); - - std::vector fres; - fres.resize(ids.size(), 1); - - auto pull_value = PullSparseValue(ids, fres, param_dim_); - std::vector pulls; - pulls.resize(bucket_feasigns * param_dim_); - pull_sparse(pulls.data(), pull_value); - } - return 0; } diff --git a/paddle/fluid/distributed/table/sparse_geo_table.cc b/paddle/fluid/distributed/table/sparse_geo_table.cc index 04cd1136382a4..655c4784156e8 100644 --- a/paddle/fluid/distributed/table/sparse_geo_table.cc +++ b/paddle/fluid/distributed/table/sparse_geo_table.cc @@ -46,5 +46,46 @@ int32_t SparseGeoTable::push_sparse(const uint64_t* keys, const float* values, return 0; } +int32_t SparseGeoTable::initialize_value() { + auto common = _config.common(); + shard_values_.reserve(task_pool_size_); + + for (int x = 0; x < task_pool_size_; ++x) { + auto shard = std::make_shared( + value_names_, value_dims_, value_offsets_, value_idx_, + initializer_attrs_, common.entry()); + + shard_values_.emplace_back(shard); + } + + auto accessor = _config.accessor(); + std::vector feasigns; + + for (size_t x = 0; x < accessor.fea_dim(); ++x) { + if (x % _shard_num == _shard_idx) { + feasigns.push_back(x); + } + } + + VLOG(3) << "has " << feasigns.size() << " ids need to be pre inited"; + + auto buckets = bucket(feasigns.size(), 10); + for (int x = 0; x < 10; ++x) { + auto bucket_feasigns = buckets[x + 1] - buckets[x]; + std::vector ids(bucket_feasigns); + std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1], + ids.begin()); + + std::vector fres; + fres.resize(ids.size(), 1); + + auto pull_value = PullSparseValue(ids, fres, param_dim_); + std::vector pulls; + pulls.resize(bucket_feasigns * param_dim_); + pull_sparse(pulls.data(), pull_value); + } + return 0; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/sparse_geo_table.h b/paddle/fluid/distributed/table/sparse_geo_table.h index 01870615af6fe..4ddb1fd706069 100644 --- a/paddle/fluid/distributed/table/sparse_geo_table.h +++ b/paddle/fluid/distributed/table/sparse_geo_table.h @@ -44,11 +44,13 @@ class SparseGeoTable : public CommonSparseTable { explicit SparseGeoTable() : CommonSparseTable() { geo_recorder = nullptr; } virtual ~SparseGeoTable() {} + virtual int32_t initialize_value(); + int32_t pull_geo_param(const uint32_t trainer_id, std::vector* values, std::vector* keys); - virtual int32_t push_sparse(const uint64_t* keys, const float* values, - size_t num) override; + int32_t push_sparse(const uint64_t* keys, const float* values, + size_t num) override; virtual int32_t initialize_recorder() { if (!geo_recorder) { diff --git 
a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 3fcddde787f69..714fbb1e4aa2d 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -272,4 +272,4 @@ void RunGraphSplit() { worker_ptr_->finalize_worker(); } -TEST(RunGraphSplit, Run) { RunGraphSplit(); } \ No newline at end of file +TEST(RunGraphSplit, Run) { RunGraphSplit(); } diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index d5abf639c83db..df000011e659a 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -15,7 +15,7 @@ cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulation) cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api) -cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta) +cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(legacy SRCS ${DYGRAPH_LEGACY} DEPS global_utils proto_desc operator pten pten_api op_registry variable_helper memcpy) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 2582cd3c9df8e..a2683db75e92c 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -32,6 +32,8 @@ class GradNodeAccumulation : public GradNodeBase { void RetainGrad( const std::function& hook); + egr::EagerTensor Grad() { return accumulated_grad; } + private: egr::EagerTensor accumulated_grad; diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index e422774bf9cf0..02eaa79fc9b28 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -83,7 +83,7 @@ void ScaleAPI(const egr::EagerTensor& x, float scale, float bias, SizeOf(dense_tensor->dtype()); auto dense_out = std::make_shared( pten::make_intrusive( - paddle::memory::Alloc(place, bytes_size), 0), + paddle::memory::Alloc(place, bytes_size)), std::move(tensor_meta)); // Handle Device Context const paddle::platform::Place& expected_kernel_place = diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index a8b3421baac02..7b20ff144a7a7 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -80,7 +80,7 @@ egr::EagerTensor scale(const egr::EagerTensor& x, float scale, float bias, scale_node->SetAttributes_scale(scale); // Set Next Edges - scale_node->AddEdges(*p_autograd_in, /*slot id*/ 0); + scale_node->AddEdges(p_autograd_in, /*slot id*/ 0); // Set TensorWrappers scale_node->SetTensorWrappers_X({x}); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index f58631e26a815..00578d9a359a3 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ 
b/paddle/fluid/eager/api/utils/global_utils.h @@ -63,7 +63,7 @@ class Controller { void SetCurrentTracer( const std::shared_ptr& tracer) { tracer_ = tracer; - VLOG(6) << "Set current tracer: " << tracer_; + VLOG(6) << "Set current tracer for Controller: " << tracer_; } bool InEagerMode() const { return in_eager_mode_; } diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 85ff6687e0dbe..7aa1917240f73 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -43,33 +43,37 @@ void RegisterReduceHookForTensor(const egr::EagerTensor& tensor, void RetainGradForTensor(const egr::EagerTensor& tensor) { // TODO(jiabin): Support More Tensor type here AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); - egr::EagerTensor* grad_tensor = meta->MutableGrad(); + std::weak_ptr weak_grad_tensor = meta->WeakGrad(); // Define Hook std::function hook = - [grad_tensor](const egr::EagerTensor& t) { - if (!grad_tensor) { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Detected null grad_tensor." - "Grad tensor in AutogradMeta of should not be nullptr")); - } - if (t.defined()) { - // Simply Copy impl() to grad_tensor - grad_tensor->set_impl(t.impl()); - return *grad_tensor; + [weak_grad_tensor](const egr::EagerTensor& t) { + if (!weak_grad_tensor.expired()) { + auto grad_tensor = weak_grad_tensor.lock(); + if (t.defined()) { + VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); + // Simply Copy impl() to grad_tensor + grad_tensor->set_impl(t.impl()); + return *grad_tensor.get(); + } else { + VLOG(7) << "Set Var for RetainGrad Hook for tensor: " << t.name(); + PADDLE_ENFORCE_EQ( + t.Var().IsInitialized(), true, + paddle::platform::errors::Fatal( + "Detected uninitialized variable, causing segmentation " + "fault " + "inside the hook." + "Variable %s has to be initialized while we need to set it." + "please check tensor initialization status.", + t.name())); + grad_tensor->MutableVar() + ->GetMutable() + ->ShareDataWith(t.Var().Get()); + return *grad_tensor.get(); + } } else { - PADDLE_ENFORCE_EQ( - t.Var().IsInitialized(), true, - paddle::platform::errors::Fatal( - "Detected uninitialized variable, causing segmentation fault " - "inside the hook." - "Variable %s has to be initialized while we need to set it." 
- "please check tensor initialization status.", - t.name())); - grad_tensor->MutableVar() - ->GetMutable() - ->ShareDataWith(t.Var().Get()); - return *grad_tensor; + VLOG(7) << "Retain NULL EagerTensor in Grad Hook"; + return EagerTensor(); } }; diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index d8f1e32ff012b..010c879571c74 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -16,6 +16,15 @@ execute_process( COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/generate_file_structures.py" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/" ) +set(tmp_dygraph_forward_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.tmp.h") +set(tmp_dygraph_forward_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.tmp.cc") +set(tmp_dygraph_node_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.h") +set(tmp_dygraph_node_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.cc") +set(dygraph_forward_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h") +set(dygraph_forward_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.cc") +set(dygraph_node_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h") +set(dygraph_node_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.cc") + if(WIN32) set(EAGER_CODEGEN_DEPS eager_generator) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") @@ -48,6 +57,14 @@ if(WIN32) add_custom_target(eager_codegen COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} ${dygraph_forward_h_path} + COMMENT "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} ${dygraph_forward_cc_path} + COMMENT "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} ${dygraph_node_h_path} + COMMENT "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} ${dygraph_node_cc_path} + COMMENT "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" DEPENDS ${EAGER_CODEGEN_DEPS} VERBATIM) else() @@ -55,6 +72,14 @@ else() COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${CMAKE_CURRENT_BINARY_DIR}/../../pybind" "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} ${dygraph_forward_h_path} + COMMENT "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} ${dygraph_forward_cc_path} + COMMENT "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} 
${dygraph_node_h_path} + COMMENT "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} ${dygraph_node_cc_path} + COMMENT "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" DEPENDS eager_generator VERBATIM) endif() diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index c87cda34cee95..2c3207b116e29 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -21,11 +21,14 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/pybind/op_function_generator.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" +#define NUM_CREATED_DUP_INPUTS 4 + namespace paddle { namespace framework { @@ -45,6 +48,62 @@ static std::string LegalizeVariableName(const std::string& var_name) { return ret; } +static bool IgnoreGradAttribute(const std::string& op_type, + const std::string& attr_name) { + // Attributes in operators_with_attrs are created manually during code + // generation + // We should ignore these arbitrary attrs when setting up grad attribute map + if (operators_with_attrs.count(op_type)) { + if (operators_with_attrs[op_type].count(attr_name)) { + return true; + } + } + + return false; +} + +static void PrepareAttrMapForOps() { + // Handle "run_program_op" + static framework::ProgramDesc fake_prog; + operators_with_attrs["run_program"] = {}; + operators_with_attrs["run_program"]["global_block"] = + fake_prog.MutableBlock(0); + + // Handle "fused_elemwise_add_activation" + std::vector functor_list = {"a", "b"}; + operators_with_attrs["fused_elemwise_add_activation"] = {}; + operators_with_attrs["fused_elemwise_add_activation"]["functor_list"] = + functor_list; + + // Handle "fused_elemwise_activation" + operators_with_attrs["fused_elemwise_activation"] = {}; + operators_with_attrs["fused_elemwise_activation"]["functor_list"] = + functor_list; + + // Handle "reverse" + std::vector axis = {0}; + operators_with_attrs["reverse"] = {}; + operators_with_attrs["reverse"]["axis"] = axis; + + // Handle "flip" + operators_with_attrs["flip"] = {}; + operators_with_attrs["flip"]["axis"] = axis; + + // Handle "cast" + operators_with_attrs["cast"] = {}; + operators_with_attrs["cast"]["out_dtype"] = 5; + operators_with_attrs["cast"]["in_dtype"] = 5; + + // Handle "transfer_dtype" + operators_with_attrs["transfer_dtype"] = {}; + operators_with_attrs["transfer_dtype"]["out_dtype"] = 5; + operators_with_attrs["transfer_dtype"]["in_dtype"] = 5; + + // Handle "c_split" + operators_with_attrs["c_split"] = {}; + operators_with_attrs["c_split"]["nranks"] = 1; +} + /* --- Helper Objects --- */ class ForwardGenerationInfo { public: @@ -135,6 +194,13 @@ class GradNodeGenerationInfo { return &grad_outs_; } + const paddle::framework::AttributeMap& GetGradAttrs() const { + return grad_attrs_; + } + paddle::framework::AttributeMap* GetMutableGradAttrs() { + return &grad_attrs_; + } + private: std::string op_base_type_; std::map grad_outs_slotname_map_; @@ -146,6 +212,7 @@ class GradNodeGenerationInfo { std::map>> grad_outs_; + paddle::framework::AttributeMap grad_attrs_; }; public: @@ -676,27 +743,48 @@ static bool 
CollectGradInformationFromOpInfo( std::map>> ins; - for (const proto::OpProto::Var& input : op_proto.inputs()) { - const std::string& in_name = input.name(); - - // Handle dispensable input: - // 1. At python level, dispensable input will be detected at Python-C - // interface and filled with an empty vector - // 2. At C++ level, customers should always pass an empty vector for any - // dispensable input - // 3. During further lowering, there will always be a placeholder VarBase - // in ins/outs no matter whether it's dispensable or not - // As a result, we always create input VarBase regardless of its - // dispensability. - - // Handle duplicable input: list(VarBase) or VarBase - // We dont know the exact number of inputs expected, - // but we only need to identify the slot name order, - // therefore fill in 1 single input VarBase is enough in this scenario - ins[in_name] = {std::shared_ptr( - new paddle::imperative::VarBase("auto_" + in_name))}; - ins[in_name][0]->SetOverridedStopGradient(false); - ins[in_name][0]->MutableVar()->GetMutable(); + + if (op_proto.inputs().size() == 1 && op_proto.outputs().size() == 1 && + op_proto.inputs()[0].duplicable() && + !op_proto.outputs()[0].duplicable()) { + VLOG(6) << "Handle op with special op_bases: " << op_type; + // @special case (sum_op): for ops with single duplicable input and single + // non-duplicable output + // feed in NUM_CREATED_DUP_INPUTS inputs to detect a + // special scenario. + const std::string& in_name = op_proto.inputs()[0].name(); + ins[in_name] = {}; + for (size_t i = 0; i < NUM_CREATED_DUP_INPUTS; i++) { + ins[in_name].emplace_back(std::shared_ptr( + new paddle::imperative::VarBase("auto_" + in_name + "_" + + std::to_string(i)))); + ins[in_name][i]->SetOverridedStopGradient(false); + ins[in_name][i]->MutableVar()->GetMutable(); + } + } else { + for (const proto::OpProto::Var& input : op_proto.inputs()) { + const std::string& in_name = input.name(); + + // Handle dispensable input: + // 1. At python level, dispensable input will be detected at Python-C + // interface and filled with an empty vector + // 2. At C++ level, customers should always pass an empty vector for any + // dispensable input + // 3. During further lowering, there will always be a placeholder VarBase + // in ins/outs no matter whether it's dispensable or not + // As a result, we always create input VarBase regardless of its + // dispensability. 
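+ // NOTE(editor, not in the original patch): in this general branch a single
+ // placeholder VarBase per input slot suffices because only the slot-name
+ // order is probed, whereas the single-duplicable-input branch above feeds
+ // NUM_CREATED_DUP_INPUTS placeholders, presumably so the collected grad
+ // info can observe how grad outputs expand per duplicated input (e.g. sum).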
+ + // Handle duplicable input: list(VarBase) or VarBase + // We dont know the exact number of inputs expected, + // but we only need to identify the slot name order, + // therefore fill in 1 single input VarBase is enough in this scenario + + ins[in_name] = {std::shared_ptr( + new paddle::imperative::VarBase("auto_" + in_name))}; + ins[in_name][0]->SetOverridedStopGradient(false); + ins[in_name][0]->MutableVar()->GetMutable(); + } } VLOG(6) << "Prepared Forward Ins Map, size = " << ins.size(); @@ -724,7 +812,6 @@ static bool CollectGradInformationFromOpInfo( VLOG(6) << "Checking AttributeMap Settings"; attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); default_attrs = attr_checker->GetDefaultAttrMap(); - VLOG(6) << "AttributeMap Checking Passed"; } else { VLOG(6) << "Detected Null Attribute Checker, use empty default_attrs"; } @@ -796,13 +883,13 @@ static bool CollectGradInformationFromOpInfo( (*op_base_infos)[index].SetOpBaseType(op_base.Type()); } - /* ------ Get Grad ins/outs ---- */ - // In case of multiple OpBase, stitch all the respective ins/outs into one + /* ------ Get Grad ins/outs/attrs ---- */ VLOG(6) << "In function size: " << grad_node->size(); for (auto iter = grad_node->begin(); iter < grad_node->end(); iter++) { int index = std::distance(grad_node->begin(), iter); auto* op_base_grad_ins = (*op_base_infos)[index].GetMutableGradIns(); auto* op_base_grad_outs = (*op_base_infos)[index].GetMutableGradOuts(); + auto* op_base_grad_attrs = (*op_base_infos)[index].GetMutableGradAttrs(); const paddle::imperative::OpBase& op_base = *iter; const std::map& @@ -810,6 +897,8 @@ static bool CollectGradInformationFromOpInfo( const std::map& g_outs = op_base.GetOutsMap(); + *op_base_grad_attrs = op_base.Attrs(); + for (const auto& it : g_ins) { if (!op_base_grad_ins->count(it.first)) (*op_base_grad_ins)[it.first] = {}; @@ -888,7 +977,7 @@ static std::string GenerateGradNodeCreationContent( if (input.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " - "egr::EagerUtils::unsafe_autograd_meta(%s);\n"; + "egr::EagerUtils::nullable_autograd_meta(%s);\n"; get_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); @@ -902,7 +991,7 @@ static std::string GenerateGradNodeCreationContent( } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta& %s = " - "*egr::EagerUtils::unsafe_autograd_meta(%s);\n"; + "*egr::EagerUtils::nullable_autograd_meta(%s);\n"; get_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } @@ -971,11 +1060,16 @@ static std::string GenerateGradNodeCreationContent( iter.GetGradInsFwdSlotnameMap(); for (auto& kv : grad_ins_fwd_slotname_map) { const std::string& tensor_wrapper_name = kv.second; + std::string full_reserved = "false"; + if (fwd_outputs_name_pos_map.find(tensor_wrapper_name) == + fwd_outputs_name_pos_map.end()) { + full_reserved = "true"; + } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE, - tensor_wrapper_name, tensor_wrapper_name); + " grad_node->SetTensorWrapper%s(%s, %s);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name, + full_reserved); } } grad_node_creation_str += "\n"; @@ -999,11 +1093,10 @@ static std::string GenerateGradNodeCreationContent( input_position); const char* 
ADD_EDGES_TEMPLATE = - " if(%s) grad_node->AddEdges(*%s, %d);\n"; + " if(%s) grad_node->AddEdges(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, input_autograd_name, input_position); - } else { compute_require_grad_args += ", &" + input_autograd_name; size_t input_position = fwd_inputs_name_pos_map.at(input_name); @@ -1013,7 +1106,7 @@ static std::string GenerateGradNodeCreationContent( grad_node_creation_str += paddle::string::Sprintf( SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); - const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(%s, %d);\n"; + const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( ADD_EDGES_TEMPLATE, input_autograd_name, input_position); } @@ -1043,6 +1136,12 @@ static std::string GenerateGradNodeCreationContent( " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; grad_node_creation_str += paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } VLOG(6) << "Generated SetGradIn/OutMeta"; @@ -1197,23 +1296,23 @@ static std::pair GenerateForwardFunctionContents( if (op_passing_outs_map[op_type].count(output_name)) { const std::string output_var_name = output_name + "Var"; - // Pass Output from function argument, + // Pass Output from function argument(EagerTensor*/vector&), // in form of shared_ptr/vector> if (output.duplicable()) { const char* FWD_NUM_ARG_TEMPLATE = - ", std::vector& %s"; + ", std::vector& %s"; std::string arg_str = paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); dygraph_function_args_str += arg_str; } else { - const char* FWD_NUM_ARG_TEMPLATE = ", egr::EagerTensor& %s"; + const char* FWD_NUM_ARG_TEMPLATE = ", egr::EagerTensor* %s"; std::string arg_str = paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); dygraph_function_args_str += arg_str; } const char* FWD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(&%s) },"; + "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; outs_contents_str += paddle::string::Sprintf( FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); @@ -1315,6 +1414,7 @@ static std::pair GenerateForwardFunctionContents( GenerateGradNodeCreationContent(fwd_info, bwd_info); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; + // [Generation] Call RetainGradForTensor VLOG(6) << "Generated GradNode Creation codes"; } @@ -1383,6 +1483,261 @@ static std::pair GenerateForwardFunctionContents( return {fwd_function_str, dygraph_function_declaration_str}; } +static std::string GenerateSingleOpBase( + const std::string& fwd_op_type, const std::string& op_base_type, + const std::unordered_map& fwd_inputs_name_pos_map, + const std::unordered_map& fwd_outputs_name_pos_map, + const std::vector& in_vars, + const std::map& grad_ins_fwd_slotname_map, + const std::map& grad_ins_grad_slotname_map, + const std::map& grad_outs_slotname_map, + const std::map< + std::string, + std::vector>>& + grad_ins, + const std::map< + std::string, + std::vector>>& + grad_outs, + const paddle::framework::AttributeMap& grad_attrs, + bool is_op_base_per_duplicable_input, size_t* outs_size) { + std::string generated_grad_function_body = ""; + + const std::string& ins_name = "ins" + 
std::to_string(*outs_size); + const std::string& outs_name = "outs" + std::to_string(*outs_size); + const std::string& attrs_name = "attrs_map" + std::to_string(*outs_size); + + // [Generation] Get Ins Map + std::string ins_contents_str = ""; + for (auto iter : grad_ins) { + const std::string& grad_input_name = iter.first; + + if (grad_ins_fwd_slotname_map.count(grad_input_name)) { + // Fwd Tensor + std::string struct_fwd_input_name = + grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; + const char* GRAD_INS_FWD_CONTENT_TEMPLATE = + "{ \"%s\", " + "egr::EagerUtils::SyncToVars(egr::EagerUtils::RecoverTensorWrapper(" + "&" + "this->%s, " + "nullptr)) },"; + ins_contents_str += + paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, + grad_input_name, struct_fwd_input_name); + + } else if (grad_ins_grad_slotname_map.count(grad_input_name)) { + // Fwd Tensor's Grad + size_t fwd_output_position = fwd_outputs_name_pos_map.at( + grad_ins_grad_slotname_map.at(grad_input_name)); + const char* GRAD_INS_GRAD_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; + ins_contents_str += paddle::string::Sprintf( + GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, fwd_output_position); + + } else { + PADDLE_THROW(platform::errors::Fatal( + "Detected mismatched slot names." + "Unable to find forward slot name that matches %s", + grad_input_name)); + } + } + if (ins_contents_str.size() > 0) + ins_contents_str.pop_back(); // // Remove trailing "," + + const char* BWD_INS_MAP_TEMPLATE = + " std::map>> %s = { " + "%s };\n"; + std::string ins_map_str = + paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); + generated_grad_function_body += ins_map_str; + + VLOG(6) << "Generated Ins Map"; + + // [Generation] Get Outs Map + std::unordered_set duplicable_input_name_set; + for (const auto& in : in_vars) { + if (in.duplicable()) duplicable_input_name_set.insert(in.name()); + } + + std::string outs_contents_str = ""; + for (auto iter : grad_outs) { + const std::string& grad_output_name = iter.first; + + if (grad_outs_slotname_map.count(grad_output_name)) { + // Fwd Tensor + const std::string& fwd_name = grad_outs_slotname_map.at(grad_output_name); + + /* Handle Special Case: "PullSparseOp", etc + + Forward: + + Ids W + | | + PullSparseOp + | + Out + + Backward: + + Ids GradOut W + | | | + PullSparseGradOp + | + GradOut + + Its grad output "GradOut" corresponds to forward output "Out", + where there is a hiden inplace involved. So we find "GradOut"'s + index + in + grads, and perform the inplace operation by constructing outs = + {{"Out", grads[i]}} + + GradOut -> Out -> fwd_output_pos -> grads position -> grads[i] + outs = {{"Out", grads[i]}} + + For returns, append "GradOut" to the very end of return list. 
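As a concrete illustration (assumed names and index, not literal generator
output): if "GradOut" corresponds to the forward output at position i, the
template below is instantiated roughly as

    { "GradOut", egr::EagerUtils::SyncToVars(grads[i]) },

so the grad OpBase reuses the incoming gradient as that output slot (the
hidden inplace described above), and "GradOut" is then appended at the very
end of the returned outputs.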
+ */ + if (!fwd_inputs_name_pos_map.count(fwd_name)) { + PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + + size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); + + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grads_position); + + } else { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + if (duplicable_input_name_set.count(fwd_name) && + !is_op_base_per_duplicable_input) { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput( " + "this->OutputMeta()[%d].Size() ) },"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + } else { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", " + "{std::make_shared(egr::Controller::Instance(" + ")." + "GenerateUniqueName())}},"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); + } + } + } else { + PADDLE_THROW(platform::errors::Fatal( + "Detected mismatched slot names." + "Unable to find forward slot name that matches %s", + grad_output_name)); + } + } + if (outs_contents_str.size() > 0) + outs_contents_str.pop_back(); // // Remove trailing "," + + const char* BWD_OUTS_MAP_TEMPLATE = + " std::map>> %s = { " + "%s };\n"; + std::string outs_map_str = paddle::string::Sprintf( + BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); + generated_grad_function_body += outs_map_str; + generated_grad_function_body += "\n"; + + VLOG(6) << "Generated Outs Map"; + + // [Generation] Get Attrs Map + const char* ATTRS_TEMPLATE = " auto %s = this->attr_map_;\n"; + std::string grad_attrs_str = + paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name); + for (const auto& iter : grad_attrs) { + if (IgnoreGradAttribute(fwd_op_type, iter.first)) continue; + std::pair type_val = + GetAttrType(iter.second, false /*is_arg*/); + const char* GRAD_ATTRS_TEMPLATE = + " %s %s = %s;\n" + " %s[\"%s\"] = %s;\n"; + std::string var_name = iter.first + std::to_string(*outs_size); + grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, type_val.first, var_name, type_val.second, + attrs_name, iter.first, var_name); + } + generated_grad_function_body += grad_attrs_str; + + const char* TRACE_OP_TEMPLATE = + " // Pass the entire attribute map to TraceOp\n" + " // The underlying kernel will pickup whatever attribute they need " + "at runtime\n" + " egr::legacy::RunOp(\"%s\", %s, %s, %s,\n" + " egr::Controller::Instance().GetExpectedPlace(),\n" + " &this->default_attr_map_, false, {});\n"; + std::string trace_opbase_str = paddle::string::Sprintf( + TRACE_OP_TEMPLATE, op_base_type, ins_name, outs_name, attrs_name); + + generated_grad_function_body += trace_opbase_str; + + VLOG(6) << "Generated Attrs Map"; + + // [Generation] Get Return + std::string outputs_str = ""; + size_t num_appended_outputs = 0; + for (auto iter : grad_outs) { + const std::string& grad_out_name = iter.first; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + + if (fwd_inputs_name_pos_map.count(fwd_name)) { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + if (!is_op_base_per_duplicable_input) { + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + 
outputs_str += paddle::string::Sprintf( + BWD_OUTPUT_TEMPLATE, fwd_input_position, outs_name, grad_out_name); + } else { + const char* BWD_OUTPUT_TEMPLATE = + " " + "outputs[0].emplace_back(egr::EagerUtils::GetOutputs(%s[\"%s\"])[0]" + ");\n"; + outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, outs_name, + grad_out_name); + } + num_appended_outputs++; + } else { + PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + } + } + + /* Handle Special Case: "PullSparseOp", etc + For returns, append "GradOut" to the very end of return list. */ + for (auto iter : grad_outs) { + const std::string& grad_out_name = iter.first; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + + if (fwd_outputs_name_pos_map.count(fwd_name)) { + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + outputs_str += paddle::string::Sprintf( + BWD_OUTPUT_TEMPLATE, num_appended_outputs, outs_name, grad_out_name); + num_appended_outputs++; + } + } + + generated_grad_function_body += outputs_str; + generated_grad_function_body += "\n"; + + *outs_size += grad_outs.size(); + + return generated_grad_function_body; +} + /* ---------------------------------------------- */ /* --------- CodeGen: GradNode::operator() ------ */ /* ---------------------------------------------- */ @@ -1396,6 +1751,7 @@ static std::string GenerateGradNodeCCContents( const std::unordered_map& fwd_outputs_name_pos_map = fwd_info.GetFwdOutputsNamePosMap(); const std::vector& in_vars = fwd_info.GetInVars(); + const std::vector& out_vars = fwd_info.GetOutVars(); VLOG(6) << "Generating Grad Node CC"; @@ -1442,9 +1798,26 @@ static std::string GenerateGradNodeCCContents( } */ + // This is a Copy + auto op_base_infos = bwd_info.GetOpBaseInfos(); + + /* Special Case: ops such as sum_grad_op is implemented abnormaly, + where it unpacked duplicable GradX and created one OpBase + corresponds to each member of GradX[i] + */ + bool is_op_base_per_duplicable_input = false; + if (in_vars.size() == 1 && out_vars.size() == 1 && in_vars[0].duplicable() && + !out_vars[0].duplicable() && + op_base_infos.size() == NUM_CREATED_DUP_INPUTS) { + is_op_base_per_duplicable_input = true; + // Only keep the first op_base + auto op_base_info = op_base_infos[0]; + op_base_infos.clear(); + op_base_infos.emplace_back(std::move(op_base_info)); + } + std::string generated_grad_function_body = ""; size_t outs_size = 0; - const auto& op_base_infos = bwd_info.GetOpBaseInfos(); for (size_t i = 0; i < op_base_infos.size(); i++) { const auto& op_base_info = op_base_infos[i]; @@ -1455,216 +1828,23 @@ static std::string GenerateGradNodeCCContents( const auto& grad_outs_slotname_map = op_base_info.GetGradOutsSlotnameMap(); const auto& grad_ins = op_base_info.GetGradIns(); const auto& grad_outs = op_base_info.GetGradOuts(); + const auto& grad_attrs = op_base_info.GetGradAttrs(); const std::string& op_base_type = op_base_info.GetOpBaseType(); - const std::string& ins_name = "ins" + std::to_string(i); - const std::string& outs_name = "outs" + std::to_string(i); - - outs_size += grad_outs.size(); - - // [Generation] Get Ins Map - std::string ins_contents_str = ""; - for (auto iter : grad_ins) { - const std::string& grad_input_name = iter.first; - - if (grad_ins_fwd_slotname_map.count(grad_input_name)) { - // Fwd Tensor - std::string struct_fwd_input_name = - 
grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; - const char* GRAD_INS_FWD_CONTENT_TEMPLATE = - "{ \"%s\", " - "egr::EagerUtils::SyncToVars(egr::EagerUtils::RecoverTensorWrapper(" - "&" - "this->%s, " - "nullptr)) },"; - ins_contents_str += - paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, - grad_input_name, struct_fwd_input_name); - - } else if (grad_ins_grad_slotname_map.count(grad_input_name)) { - // Fwd Tensor's Grad - size_t fwd_output_position = fwd_outputs_name_pos_map.at( - grad_ins_grad_slotname_map.at(grad_input_name)); - const char* GRAD_INS_GRAD_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; - ins_contents_str += - paddle::string::Sprintf(GRAD_INS_GRAD_CONTENT_TEMPLATE, - grad_input_name, fwd_output_position); - - } else { - PADDLE_THROW(platform::errors::Fatal( - "Detected mismatched slot names." - "Unable to find forward slot name that matches %s", - grad_input_name)); - } - } - if (ins_contents_str.size() > 0) - ins_contents_str.pop_back(); // // Remove trailing "," - - const char* BWD_INS_MAP_TEMPLATE = - " std::map>> %s = { " - "%s };\n"; - std::string ins_map_str = paddle::string::Sprintf( - BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); - generated_grad_function_body += ins_map_str; - - VLOG(6) << "Generated Ins Map"; - - // [Generation] Get Outs Map - std::unordered_set duplicable_input_name_set; - for (const auto& in : in_vars) { - if (in.duplicable()) duplicable_input_name_set.insert(in.name()); - } - - std::string outs_contents_str = ""; - for (auto iter : grad_outs) { - const std::string& grad_output_name = iter.first; - - if (grad_outs_slotname_map.count(grad_output_name)) { - // Fwd Tensor - const std::string& fwd_name = - grad_outs_slotname_map.at(grad_output_name); - - /* Handle Special Case: "PullSparseOp", etc - - Forward: - - Ids W - | | - PullSparseOp - | - Out - - Backward: - - Ids GradOut W - | | | - PullSparseGradOp - | - GradOut - - Its grad output "GradOut" corresponds to forward output "Out", - where there is a hiden inplace involved. So we find "GradOut"'s - index - in - grads, and perform the inplace operation by constructing outs = - {{"Out", grads[i]}} - - GradOut -> Out -> fwd_output_pos -> grads position -> grads[i] - outs = {{"Out", grads[i]}} - - For returns, append "GradOut" to the very end of return list. - */ - if (!fwd_inputs_name_pos_map.count(fwd_name)) { - PADDLE_ENFORCE( - fwd_outputs_name_pos_map.count(fwd_name), - paddle::platform::errors::Fatal( - "fwd_name not found in fwd_inputs_name_pos_map nor " - "fwd_outputs_name_pos_map")); - - size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); - - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; - outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grads_position); - - } else { - size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); - if (duplicable_input_name_set.count(fwd_name)) { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput( " - "this->OutputMeta()[%d].Size() ) },"; - outs_contents_str += - paddle::string::Sprintf(GRAD_OUTS_CONTENT_TEMPLATE, - grad_output_name, fwd_input_position); - } else { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance(" - ")." 
- "GenerateUniqueName())}},"; - outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); - } - } - } else { - PADDLE_THROW(platform::errors::Fatal( - "Detected mismatched slot names." - "Unable to find forward slot name that matches %s", - grad_output_name)); - } - } - if (outs_contents_str.size() > 0) - outs_contents_str.pop_back(); // // Remove trailing "," - - const char* BWD_OUTS_MAP_TEMPLATE = - " std::map>> %s = { " - "%s };\n"; - std::string outs_map_str = paddle::string::Sprintf( - BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); - generated_grad_function_body += outs_map_str; - generated_grad_function_body += "\n"; - - VLOG(6) << "Generated Outs Map"; - - // [Generation] Get Attrs Map - const char* TRACE_OP_TEMPLATE = - " // Pass the entire attribute map to TraceOp\n" - " // The underlying kernel will pickup whatever attribute they need " - "at runtime\n" - " egr::legacy::RunOp(\"%s\", %s, %s, this->attr_map_,\n" - " egr::Controller::Instance().GetExpectedPlace(),\n" - " &this->default_attr_map_, false, {});\n"; - std::string trace_opbase_str = paddle::string::Sprintf( - TRACE_OP_TEMPLATE, op_base_type, ins_name, outs_name); - - generated_grad_function_body += trace_opbase_str; - - VLOG(6) << "Generated Attrs Map"; - - // [Generation] Get Return - std::string outputs_str = ""; - size_t num_appended_outputs = 0; - for (auto iter : grad_outs) { - const std::string& grad_out_name = iter.first; - const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); - - if (fwd_inputs_name_pos_map.count(fwd_name)) { - size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); - const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; - outputs_str += paddle::string::Sprintf( - BWD_OUTPUT_TEMPLATE, fwd_input_position, outs_name, grad_out_name); - num_appended_outputs++; - } else { - PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), - paddle::platform::errors::Fatal( - "fwd_name not found in fwd_inputs_name_pos_map nor " - "fwd_outputs_name_pos_map")); - } - } - - /* Handle Special Case: "PullSparseOp", etc - For returns, append "GradOut" to the very end of return list. 
*/ - for (auto iter : grad_outs) { - const std::string& grad_out_name = iter.first; - const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); - - if (fwd_outputs_name_pos_map.count(fwd_name)) { - const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; - outputs_str += - paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, num_appended_outputs, - outs_name, grad_out_name); - num_appended_outputs++; - } - } + generated_grad_function_body += GenerateSingleOpBase( + fwd_op_type, op_base_type, fwd_inputs_name_pos_map, + fwd_outputs_name_pos_map, in_vars, grad_ins_fwd_slotname_map, + grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, + grad_attrs, is_op_base_per_duplicable_input, &outs_size); + } - generated_grad_function_body += outputs_str; - generated_grad_function_body += "\n"; + if (is_op_base_per_duplicable_input) { + const char* OP_BASE_PER_DUP_INPUT_TEMPLATE = + " for(int i = 0; i < this->OutputMeta()[0].Size(); i++) {\n" + " %s\n" + " }\n"; + generated_grad_function_body = paddle::string::Sprintf( + OP_BASE_PER_DUP_INPUT_TEMPLATE, generated_grad_function_body); } const char* BWD_RETURN_TEMPLATE = @@ -1765,6 +1945,7 @@ static std::string GenerateGradNodeHeaderContents( std::string tensor_wrapper_arg_str; std::string tensor_wrapper_body_str; + std::string full_reserved_str = "full_reserved"; if (duplicable_tensors.count(tensor_wrapper_name)) { const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = "const std::vector& %s"; @@ -1797,17 +1978,18 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, true /*full_reserved*/);"; + "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, - tensor_wrapper_name); + tensor_wrapper_name, full_reserved_str); } - + std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = - " void SetTensorWrapper%s(%s) {\n %s\n }\n"; + " void SetTensorWrapper%s(%s, %s) {\n %s\n }\n"; set_tensor_wrappers_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - tensor_wrapper_arg_str, tensor_wrapper_body_str); + tensor_wrapper_arg_str, full_reserved_signature_str, + tensor_wrapper_body_str); } } VLOG(6) << "Generated TensorWrapper"; @@ -1842,19 +2024,15 @@ static std::string GenerateDygraphHFileIncludes() { return dygraph_forward_api_includes_str; } -static void GenerateForwardHFile(const std::string& output_dir, +static void GenerateForwardHFile(const std::string& dygraph_forward_api_path, const std::string& dygraph_forward_api_str) { - std::string dygraph_forward_api_path = output_dir + "/dygraph_forward_api.h"; std::ofstream forward_header_stream(dygraph_forward_api_path, std::ios::out); forward_header_stream << dygraph_forward_api_str; forward_header_stream.close(); } -static void GenerateForwardDygraphFile(const std::string& output_dir, +static void GenerateForwardDygraphFile(const std::string& forward_cc_path, const std::string& fwd_function_str) { - std::string forwards_dir = output_dir + "/forwards/"; - std::string forward_cc_filename = "dygraph_forward_functions.cc"; - std::string forward_cc_path = forwards_dir + forward_cc_filename; const char* FORWARD_INCLUDE_TEMPLATE = "#include " "\"paddle/fluid/eager/api/generated/fluid_generated/" @@ -1871,11 +2049,8 @@ static void 
GenerateForwardDygraphFile(const std::string& output_dir, forward_cc_stream.close(); } -static void GenerateNodeHFile(const std::string& output_dir, +static void GenerateNodeHFile(const std::string& node_h_path, const std::string& grad_node_str) { - std::string nodes_dir = output_dir + "/nodes/"; - std::string node_h_filename = "nodes.h"; - std::string node_h_path = nodes_dir + node_h_filename; std::string node_h_include_str = "#pragma once\n" "#include \"paddle/fluid/eager/tensor_wrapper.h\"\n" @@ -1887,11 +2062,8 @@ static void GenerateNodeHFile(const std::string& output_dir, node_h_stream.close(); } -static void GenerateNodeCCFile(const std::string& output_dir, +static void GenerateNodeCCFile(const std::string& node_cc_path, const std::string& grad_function_str) { - std::string nodes_dir = output_dir + "/nodes/"; - std::string node_cc_filename = "nodes.cc"; - std::string node_cc_path = nodes_dir + node_cc_filename; const char* NODE_CC_INCLUDE_TEMPLATE = "#include \"glog/logging.h\"\n" "#include \"paddle/pten/api/all.h\"\n" @@ -2021,54 +2193,24 @@ static void DygraphCodeGeneration(const std::string& output_dir) { } VLOG(6) << "-------- GenerateDygraphForwardCCFile -------"; + std::string forward_cc_path = + output_dir + "/forwards/dygraph_forward_functions.tmp.cc"; fwd_function_str += "\n"; fwd_function_str += GenerateCoreOpsReturnsInfo(); - GenerateForwardDygraphFile(output_dir, fwd_function_str); + GenerateForwardDygraphFile(forward_cc_path, fwd_function_str); VLOG(6) << "-------- GenerateForwardHFile -------"; - GenerateForwardHFile(output_dir, dygraph_forward_api_str); + std::string dygraph_forward_api_path = + output_dir + "/dygraph_forward_api.tmp.h"; + GenerateForwardHFile(dygraph_forward_api_path, dygraph_forward_api_str); VLOG(6) << "-------- GenerateNodeHFile -------"; - GenerateNodeHFile(output_dir, grad_node_h_str); + std::string node_h_path = output_dir + "/nodes/nodes.tmp.h"; + GenerateNodeHFile(node_h_path, grad_node_h_str); VLOG(6) << "-------- GenerateNodeCCFile -------"; - GenerateNodeCCFile(output_dir, grad_node_cc_str); -} - -static void PrepareAttrMapForOps() { - // Handle "fused_elemwise_add_activation" - std::vector functor_list = {"a", "b"}; - operators_with_attrs["fused_elemwise_add_activation"] = {}; - operators_with_attrs["fused_elemwise_add_activation"]["functor_list"] = - functor_list; - - // Handle "fused_elemwise_activation" - operators_with_attrs["fused_elemwise_activation"] = {}; - operators_with_attrs["fused_elemwise_activation"]["functor_list"] = - functor_list; - - // Handle "reverse" - std::vector axis = {0}; - operators_with_attrs["reverse"] = {}; - operators_with_attrs["reverse"]["axis"] = axis; - - // Handle "flip" - operators_with_attrs["flip"] = {}; - operators_with_attrs["flip"]["axis"] = axis; - - // Handle "cast" - operators_with_attrs["cast"] = {}; - operators_with_attrs["cast"]["out_dtype"] = 5; - operators_with_attrs["cast"]["in_dtype"] = 5; - - // Handle "transfer_dtype" - operators_with_attrs["transfer_dtype"] = {}; - operators_with_attrs["transfer_dtype"]["out_dtype"] = 5; - operators_with_attrs["transfer_dtype"]["in_dtype"] = 5; - - // Handle "c_split" - operators_with_attrs["c_split"] = {}; - operators_with_attrs["c_split"]["nranks"] = 1; + std::string node_cc_path = output_dir + "/nodes/nodes.tmp.cc"; + GenerateNodeCCFile(node_cc_path, grad_node_cc_str); } } // namespace framework diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 7f46136416752..18156f913de78 100644 --- 
a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -14,8 +14,8 @@ #pragma once +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/grad_node_info.h" - namespace egr { using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta; @@ -75,9 +75,20 @@ class AutogradMeta : public AbstractAutogradMeta { ~AutogradMeta() override = default; - const egr::EagerTensor& Grad() const { return grad_; } + const egr::EagerTensor& Grad() const { + PADDLE_ENFORCE_NOT_NULL( + grad_.get(), + paddle::platform::errors::InvalidArgument( + "Should Not get NULL from Grad pointer, since " + "we should have default EagerTensor once we init AutoGradMeta. " + "if you got this error may indicates framework error in " + "PaddlePaddle")); + return *(grad_.get()); + } + + egr::EagerTensor* MutableGrad() { return grad_.get(); } - egr::EagerTensor* MutableGrad() { return &grad_; } + std::weak_ptr WeakGrad() { return grad_; } void SetGradNode(const std::shared_ptr& grad_node) { PADDLE_ENFORCE_NOT_NULL( @@ -120,14 +131,19 @@ class AutogradMeta : public AbstractAutogradMeta { void SetPersistable(bool persistable) { persistable_ = persistable; } + bool RetainGrads() { return retain_grads_; } + + void SetRetainGrads(bool value) { retain_grads_ = value; } + private: // TODO(jiabin) :Should we use pointer instead of object? - egr::EagerTensor grad_; + std::shared_ptr grad_{std::make_shared( + egr::Controller::Instance().GenerateUniqueName("@grad"))}; // GradNodeBase is base class of all grad op which is a // wrapper for grad op. This class will make grad op easy // to be traced. - std::shared_ptr grad_node_; + std::shared_ptr grad_node_ = nullptr; /** * Why we need slot id here? @@ -149,6 +165,8 @@ class AutogradMeta : public AbstractAutogradMeta { bool persistable_{false}; + bool retain_grads_{false}; + // TODO(jiabin) :Support Quantum here and add cache mechanism as // VarCache defined in VarBase }; diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index bee7124b55cd9..9a760c03728cd 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -71,6 +71,14 @@ std::unordered_map getInDegreeMap( return node_in_degree_map; } +void RunBackwardHooks( + const std::vector>& grad_tensors, + egr::GradNodeBase* grad_node) { + grad_node->ApplyGradientHooks(grad_tensors); + VLOG(6) << "Apply Reduce Hooks for node"; + grad_node->ApplyReduceHooks(); +} + void RunBackward(const std::vector& tensors, const std::vector& grad_tensors, bool retain_graph) { @@ -157,7 +165,11 @@ void RunBackward(const std::vector& tensors, std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); VLOG(6) << "Run Backward Kernel with input_buffer"; - // Run Backward Node and get outputs + + RunBackwardHooks(node_input_buffer->Buffers(), node); + // TODO(jiabin): Support post hook here and make hook run in seperate + // operator + // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = (*node)(node_input_buffer->Buffers()); // TODO(jiabin): Should we erase it or find a more efficient way. 
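Storing grad_ as a std::shared_ptr<egr::EagerTensor> and exposing WeakGrad() lets gradient hooks refer to a tensor's grad buffer without extending its lifetime: a hook can hold a std::weak_ptr and simply pass the gradient through if the AutogradMeta has already been destroyed by the time backward runs. A minimal sketch of that pattern is below, using the hook signature seen in the test hunks further down; the retain-grad body is an assumption for illustration, not code taken from this patch. RunBackward then applies such hooks via the RunBackwardHooks helper added in backward.cc above, right before invoking the grad node itself.

// 'autograd_meta' is assumed to be an egr::AutogradMeta* obtained elsewhere.
// lock() only succeeds while the AutogradMeta (and therefore the shared grad
// tensor) is still alive.
std::weak_ptr<egr::EagerTensor> weak_grad = autograd_meta->WeakGrad();

auto retain_grad_hook = [weak_grad](const egr::EagerTensor& incoming) {
  if (auto grad = weak_grad.lock()) {
    *grad = incoming;  // retain a copy of the gradient (illustrative; the real
                       // hook utilities may copy differently)
  }
  return incoming;     // pass the gradient through unchanged
};
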
diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 1a3a5a7f9ee0a..0bcef2253f993 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -234,8 +234,7 @@ class EagerTensor final { auto* framework_tensor = var_.GetMutable(); framework_tensor->Resize(tensor_->dims()); - framework_tensor->set_layout( - pten::TransToFluidDataLayout(tensor_->layout())); + framework_tensor->set_layout(tensor_->layout()); // Contruct framework::Tensor from egr::EagerTensor auto tensor_dense = std::dynamic_pointer_cast(tensor_->impl()); diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index a1c25f6766a53..0e6f6aa63dd0f 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/dense_tensor.h" @@ -35,6 +36,33 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { adj_edges_.resize(bwd_out_slot_num); } +void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { + PADDLE_ENFORCE_LT( + slot_id, adj_edges_.size(), + paddle::platform::errors::InvalidArgument( + "Given slot id is out of range of adj_edges outter size, " + "adj_edges is designed to has the same size of grad " + "inputs's slot num.")); + for (const auto& meta : *metas) { + // adj_edges has as same rank as fwd inputs, and record it's output rank + // from + // its pre-ops + if (meta) { + auto node = meta->GetMutableGradNode(); + if (node) { + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } else { + if (!meta->StopGradient()) { + meta->SetGradNode(std::make_shared()); + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } + } + } + } +} + void GradNodeBase::AddEdges(const std::vector& metas, size_t slot_id) { PADDLE_ENFORCE_LT( @@ -47,20 +75,42 @@ void GradNodeBase::AddEdges(const std::vector& metas, // adj_edges has as same rank as fwd inputs, and record it's output rank // from // its pre-ops - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); + if (meta) { + auto node = meta->GetMutableGradNode(); + if (node) { + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } else { + if (!meta->StopGradient()) { + meta->SetGradNode(std::make_shared()); + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } + } + } } } -void GradNodeBase::AddEdges(const AutogradMeta& meta, size_t slot_id) { +void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { PADDLE_ENFORCE_LT( slot_id, adj_edges_.size(), paddle::platform::errors::InvalidArgument( "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); - adj_edges_[slot_id].emplace_back(meta.GetMutableGradNode(), - meta.OutRankInfo()); + if (meta) { + auto node = meta->GetMutableGradNode(); + if (node) { + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } else { + if (!meta->StopGradient()) { + meta->SetGradNode(std::make_shared()); + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } + } + } } const std::vector& 
GradNodeBase::InputMeta() const { @@ -89,6 +139,11 @@ void GradNodeBase::SetGradInMeta(const std::vector& fwd_out, // Init stop gradient vector before use to avoid push back meta.Init(slot_size); for (size_t i = 0; i < slot_size; i++) { + PADDLE_ENFORCE_NOT_NULL(fwd_out[i], + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be called while " + "autograd_meta is not null. If you got this " + "error, it indicates bugs in framework.")); if (fwd_out[i]->StopGradient()) { // Set Stop Gradient only when its true or non-initialized autograd_meta, // since all default value is false. @@ -135,6 +190,10 @@ void GradNodeBase::SetGradOutMeta(const std::vector& fwd_in, // Init stop gradient vector before use to avoid push back meta.Init(slot_size); for (size_t i = 0; i < slot_size; i++) { + if (!fwd_in[i]) { + meta.SetStopGradient(i, true); + continue; + } if (fwd_in[i]->StopGradient()) { // Set Stop Gradient only when its true or non-initialized autograd_meta, // since all default value is false. @@ -211,6 +270,7 @@ std::vector> GradNodeBase::ApplyGradientHooks( slot_out.resize(tensors[slot_id].size()); egr::EagerTensor& out = slot_out[rank]; if (!out.defined() || !out.initialized()) { + VLOG(8) << "Run Hook for tensor: " << tensors[slot_id][rank].name(); out = hook(tensors[slot_id][rank]); } else { // TODO(jiabin): Why this? diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 6a4053e837894..545b577f4bda9 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -105,8 +105,9 @@ class GradNodeBase { * * This one is called slot by slot * **/ + void AddEdges(std::vector* metas, size_t slot_id); void AddEdges(const std::vector& metas, size_t slot_id); - void AddEdges(const AutogradMeta& meta, size_t slot_id); + void AddEdges(AutogradMeta* meta, size_t slot_id); /** * GetEdges is designed to get all edges of current node**/ diff --git a/paddle/fluid/eager/legacy/infer_var_type_context.h b/paddle/fluid/eager/legacy/infer_var_type_context.h index 8e7bbef37d805..2d5a8d806fee7 100644 --- a/paddle/fluid/eager/legacy/infer_var_type_context.h +++ b/paddle/fluid/eager/legacy/infer_var_type_context.h @@ -153,7 +153,8 @@ class TensorRuntimeInferVarTypeContext paddle::framework::proto::VarType::Type GetOutputType( const std::string& name, const int& index = 0) const override { - return paddle::framework::ToVarType(outputs_.at(name)[index]->Var().Type()); + // TODO(jiabin): Support SelectedRows when we have it. 
+ return paddle::framework::proto::VarType::LOD_TENSOR; } paddle::framework::proto::VarType::Type GetInputDataType( diff --git a/paddle/fluid/eager/legacy/op_runner.cc b/paddle/fluid/eager/legacy/op_runner.cc index 027dc6ee1cba2..4dab96c53eca4 100644 --- a/paddle/fluid/eager/legacy/op_runner.cc +++ b/paddle/fluid/eager/legacy/op_runner.cc @@ -37,6 +37,7 @@ void OpRunImpl(const paddle::framework::OperatorBase& op, const paddle::framework::AttributeMap& attrs, const paddle::framework::AttributeMap& default_attrs, const paddle::platform::Place& place) { + VLOG(6) << "Get Opertor With Kernel"; auto* op_kernel = dynamic_cast(&op); PADDLE_ENFORCE_NOT_NULL( @@ -44,11 +45,13 @@ void OpRunImpl(const paddle::framework::OperatorBase& op, "Only support operator with kernel in Dygraph mode.")); auto& info = op.Info(); if (info.infer_var_type_) { + VLOG(6) << "Run InferVarType"; egr::legacy::TensorRuntimeInferVarTypeContext infer_var_type_ctx( ins, outs, attrs, default_attrs); + VLOG(9) << "Actual Run InferVarType"; info.infer_var_type_(&infer_var_type_ctx); } - + VLOG(6) << "Initialize output tensor"; // Initialize output tensor for (auto& tensor_pair : outs) { for (auto& tensor : tensor_pair.second) { @@ -77,10 +80,13 @@ void OpRunImpl(const paddle::framework::OperatorBase& op, * after the execution of op, but the original input is directly * overwritten in the previous dynamic graph implemention. */ + VLOG(6) << "Prepare Op"; auto prepared_op = egr::legacy::PreparedOp::Prepare( ins, outs, *op_kernel, place, attrs, default_attrs); + VLOG(6) << "Prepare Data"; auto tmp_ins_ptr = egr::legacy::PrepareData(*op_kernel, ins, prepared_op.kernel_type()); + VLOG(6) << "Run Prepared Op"; if (tmp_ins_ptr == nullptr) { prepared_op.Run(ins, outs, attrs, default_attrs); } else { @@ -130,6 +136,7 @@ void RunOp(const std::string& type, const NameTensorMap& ins, } auto amp_level = egr::Controller::Instance().GetAMPLevel(); + VLOG(6) << "Check AMP status"; NameTensorMap new_ins = ins; if (amp_level == paddle::imperative::AmpLevel::O1) { VLOG(5) << "Auto mixed precision run operator: " << type; @@ -140,6 +147,7 @@ void RunOp(const std::string& type, const NameTensorMap& ins, } try { + VLOG(6) << "Get Device id"; if (paddle::platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::SetDeviceId( @@ -165,7 +173,7 @@ void RunOp(const std::string& type, const NameTensorMap& ins, "PaddlePaddle should compile with NPU if use NPUPlace.")); #endif } - + VLOG(6) << "Step in OpRunImpl"; OpRunImpl(*op, new_ins, outs, attrs, *default_attrs, place); } catch (paddle::platform::EnforceNotMet& exception) { paddle::framework::AppendErrorOpHint(type, &exception); @@ -182,7 +190,7 @@ void RunOp(const std::string& type, const NameTensorMap& ins, PADDLE_THROW(paddle::platform::errors::Fatal( "Operator %s raises an unknown exception.", type)); } - + VLOG(6) << "Finish Run Op"; // TODO(jiabin): Support this later // if (enable_program_desc_tracing_) { // VLOG(5) << "Trace op " << type << " into ProgramDesc"; diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index 547ee86967491..4e892b14a9c9c 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -76,6 +76,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs, const paddle::platform::Place& place, const paddle::framework::AttributeMap& attrs, const paddle::framework::AttributeMap& default_attrs) { + 
VLOG(6) << "Preparing an Op"; paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -146,7 +147,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs, if (!(expected_kernel_key.place_ == place)) { dev_ctx = pool.Get(expected_kernel_key.place_); } - + VLOG(6) << "Construct Prepared Op"; return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx); } @@ -168,12 +169,12 @@ static void PreparedOpRunImpl( const NameTensorMap& outs, const paddle::framework::AttributeMap& attrs, const paddle::framework::AttributeMap& default_attrs) { // TODO(zjl): remove scope in dygraph + VLOG(6) << "Runing Prepared Op"; paddle::framework::Scope scope; EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, op.Type()); - static_cast(op).InferShape( - &infer_shape_ctx); + op.Info().infer_shape_(&infer_shape_ctx); func(EagerExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, default_attrs)); @@ -198,6 +199,7 @@ static void PreparedOpRunImpl( if (paddle::framework::IsComplexType(kernel_type.data_type_)) { HandleComplexGradToRealGrad(outs); } + VLOG(6) << "Finish Runing Prepared Op"; } void PreparedOp::Run(const NameTensorMap& ins, const NameTensorMap& outs, diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index abc200f7130ff..aebb0553e28b6 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -58,7 +58,7 @@ TEST(GradNodeInfo, GradNodeBase) { auto auto_grad0 = std::make_shared(edge0); egr::Edge edge1(grad_test_node1, 3, 4); auto auto_grad1 = std::make_shared(edge1); - grad_test_node0->AddEdges((*auto_grad0.get()), 0); + grad_test_node0->AddEdges(auto_grad0.get(), 0); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index e292844c8ee58..1fef0905b4cc5 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -41,7 +41,7 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) { paddle::framework::product(t_dense->dims()) * SizeOf(t_dense->dtype()); auto ret_dense = std::make_shared( pten::make_intrusive( - paddle::memory::Alloc(place, bytes_size), 0), + paddle::memory::Alloc(place, bytes_size)), std::move(ret_meta)); float* t_ptr = t_dense->mutable_data(); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 32b28d8efd21b..4ec49bfa56676 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -42,7 +42,7 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) { paddle::framework::product(t_dense->dims()) * SizeOf(t_dense->dtype()); auto ret_dense = std::make_shared( pten::make_intrusive( - paddle::memory::Alloc(place, bytes_size), 0), + paddle::memory::Alloc(place, bytes_size)), std::move(ret_meta)); float* t_ptr = t_dense->mutable_data(); diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 2e52753bcc257..e73dfa2ec8b6e 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -14,6 
+14,7 @@ #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/pten/api/all.h" @@ -24,6 +25,9 @@ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/variable.h" +PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, true, + "retain grad for all tensor"); + namespace egr { /** * Implementation of Eager Utils. @@ -50,8 +54,9 @@ AutogradMeta* EagerUtils::unsafe_autograd_meta(const egr::EagerTensor& target) { std::vector EagerUtils::unsafe_autograd_meta( const std::vector& targets) { std::vector metas; + metas.reserve(targets.size()); for (const egr::EagerTensor& t : targets) { - metas.push_back(unsafe_autograd_meta(t)); + metas.emplace_back(unsafe_autograd_meta(t)); } return metas; } @@ -64,6 +69,16 @@ AutogradMeta* EagerUtils::nullable_autograd_meta( return static_cast(p_autograd_meta); } +std::vector EagerUtils::nullable_autograd_meta( + const std::vector& targets) { + std::vector metas; + metas.reserve(targets.size()); + for (const egr::EagerTensor& t : targets) { + metas.emplace_back(nullable_autograd_meta(t)); + } + return metas; +} + std::vector EagerUtils::multi_autograd_meta( std::vector* targets) { std::vector ret; @@ -140,7 +155,8 @@ static std::shared_ptr TrySyncToVar( if (tensor->initialized() || tensor->Var().IsInitialized()) { tensor->SyncToVar(paddle::framework::proto::VarType_Type_LOD_TENSOR); } - return std::make_shared(*tensor); + return std::shared_ptr(tensor, + [&](egr::EagerTensor* ptr) {}); } std::vector> EagerUtils::TrySyncToVars( @@ -159,6 +175,17 @@ std::vector> EagerUtils::TrySyncToVars( return res; } +std::vector> EagerUtils::TrySyncToVars( + const std::vector& tensors) { + std::vector> res; + size_t num = tensors.size(); + res.reserve(num); + for (size_t i = 0; i < num; i++) { + res.emplace_back(TrySyncToVar(tensors[i])); + } + return res; +} + /* ---- VarBase -> Tensor ---- */ std::vector> EagerUtils::SyncToTensors( const egr::EagerTensor& tensor) { @@ -236,4 +263,22 @@ std::vector EagerUtils::RecoverTensorWrapper( return ret; } +void EagerUtils::CheckAndRetainGrad(const egr::EagerTensor& tensor) { + VLOG(6) << "Check RetainGradForTensor: " << tensor.name(); + if (FLAGS_retain_grad_for_all_tensor) { + VLOG(6) << "RetainGradForTensor: " << tensor.name(); + egr::egr_utils_api::RetainGradForTensor(tensor); + } +} + +void EagerUtils::CheckAndRetainGrad( + const std::vector& tensors) { + if (FLAGS_retain_grad_for_all_tensor) { + for (auto& tensor : tensors) { + VLOG(6) << "RetainGradForTensor: " << tensor.name(); + egr::egr_utils_api::RetainGradForTensor(tensor); + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 843b6404af555..bc1acbd69d049 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -116,6 +116,8 @@ class EagerUtils { // This method will return an AutogradMeta pointer unsafely. 
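The pointer overload of TrySyncToVar above now hands back a shared_ptr that merely aliases a caller-owned EagerTensor: the no-op deleter guarantees the shared_ptr machinery never frees it, avoiding the copy made by the previous std::make_shared(*tensor) form. A standalone illustration of that non-owning shared_ptr idiom (not code from the patch):

#include <memory>

struct Widget { int value = 0; };

int main() {
  Widget caller_owned;  // lifetime is managed entirely by this scope
  // Aliasing shared_ptr with a deleter that intentionally does nothing.
  std::shared_ptr<Widget> alias(&caller_owned, [](Widget*) {});
  alias->value = 42;    // observed through the alias as usual
  // When 'alias' (and any copies) go out of scope, caller_owned is NOT freed;
  // the caller is responsible for keeping it alive while the alias is in use.
  return 0;
}
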
static AutogradMeta* nullable_autograd_meta(const egr::EagerTensor& target); + static std::vector nullable_autograd_meta( + const std::vector& targets); static AutogradMeta* unsafe_autograd_meta(const egr::EagerTensor& target); static std::vector unsafe_autograd_meta( const std::vector& targets); @@ -149,6 +151,8 @@ class EagerUtils { egr::EagerTensor* tensor); static std::vector> TrySyncToVars( std::vector* tensors); + static std::vector> TrySyncToVars( + const std::vector& tensors); static std::vector> SyncToVars( const egr::EagerTensor& tensor); @@ -163,6 +167,9 @@ class EagerUtils { static std::vector GetOutputs( const std::vector>& outs); static egr::EagerTensor GetOutput(const std::shared_ptr& outs); + + static void CheckAndRetainGrad(const egr::EagerTensor& tensor); + static void CheckAndRetainGrad(const std::vector& tensors); }; } // namespace egr diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index d8051e1fbb116..902943d14ff9d 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -68,15 +68,15 @@ cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) if (WIN32) windows_symbolic(tensor_util SRCS tensor_util.cu) - nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) + nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context dense_tensor) add_dependencies(tensor tensor_util) else() - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler dense_tensor) endif(WIN32) elseif(WITH_ROCM) - hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler) + hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler dense_tensor) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler dense_tensor) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) @@ -91,15 +91,16 @@ endif() cc_test(copy_same_tensor_test SRCS copy_same_tensor_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) +cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context place memory) if(WITH_GPU) - nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor) + nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor) elseif(WITH_ROCM) - hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor) + hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor) else() - cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) + cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS mixed_vector place memory device_context tensor) endif() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) @@ -274,7 +275,11 @@ cc_library(lod_rank_table SRCS 
lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) +if (TENSORRT_FOUND) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper tensorrt_engine_op) +else() cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) +endif(TENSORRT_FOUND) cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector op_registry while_op_helper recurrent_op_helper conditional_block_op_helper) if(WITH_DISTRIBUTE) diff --git a/paddle/fluid/framework/blocking_queue.h b/paddle/fluid/framework/blocking_queue.h index 4f35da402f3ec..04937fa6b97b3 100644 --- a/paddle/fluid/framework/blocking_queue.h +++ b/paddle/fluid/framework/blocking_queue.h @@ -75,6 +75,22 @@ class BlockingQueue { return ret; } + void PopAll(std::deque *empty_queue) { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { return !q_.empty(); }); + std::swap(*empty_queue, q_); + } + + std::deque PopAll() { + std::deque ret; + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [this] { return !q_.empty(); }); + std::swap(ret, q_); + } + return ret; + } + T Pop() { std::unique_lock lock(mutex_); cv_.wait(lock, [=] { return !q_.empty(); }); diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index 0b1fdc3944689..14bef7fe023f6 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -77,8 +77,8 @@ static bool CopySameTensorTestMain(const DDim &dims, TensorCopySync(src_tensor, platform::CPUPlace(), &dst_cpu_tensor); } - const void *ground_truth_ptr = src_cpu_tensor.data(); - const void *result_ptr = dst_cpu_tensor.data(); + const void *ground_truth_ptr = src_cpu_tensor.data(); + const void *result_ptr = dst_cpu_tensor.data(); size_t byte_num = product(dims) * sizeof(T); return std::memcmp(ground_truth_ptr, result_ptr, byte_num) == 0; } diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 572e36be9b776..fd2522b0336ff 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -94,7 +94,7 @@ std::vector ParseAttrStr(const std::string& attr) { // 2. 
type rlt.emplace_back(string::trim_spaces(attr.substr(split_pos + 1))); - VLOG(1) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1]; + VLOG(3) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1]; return rlt; } @@ -109,11 +109,11 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, const std::vector& inputs, const std::vector& outputs, const std::vector& attrs) { - VLOG(1) << "Custom Operator: Start run KernelFunc."; + VLOG(3) << "Custom Operator: Start run KernelFunc."; std::vector custom_ins; std::vector> custom_vec_ins; for (auto& in_name : inputs) { - VLOG(1) << "Custom Operator: input name - " << in_name; + VLOG(3) << "Custom Operator: input name - " << in_name; if (detail::IsDuplicableVar(in_name)) { // return const std::vector auto vec_x = ctx.MultiInput(in_name); @@ -185,11 +185,11 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } } - VLOG(1) << "Custom Operator: Run ComputeFunc."; + VLOG(3) << "Custom Operator: Run ComputeFunc."; try { auto outs = func(custom_ins, custom_vec_ins, custom_attrs); - VLOG(1) << "Custom Operator: Share outputs into ExecutionContext."; + VLOG(3) << "Custom Operator: Share outputs into ExecutionContext."; for (size_t i = 0; i < outputs.size(); ++i) { auto out_name = outputs[i]; if (detail::IsDuplicableVar(out_name)) { @@ -207,14 +207,14 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, "Tensors.", vec_true_outs.size(), outs.size())); for (size_t j = 0; j < vec_true_outs.size(); ++j) { - experimental::MovesSharedStorage( + experimental::SharesStorage( std::dynamic_pointer_cast(outs.at(j).impl()) .get(), vec_true_outs.at(j)); } } else { auto* true_out = ctx.Output(out_name); - experimental::MovesSharedStorage( + experimental::SharesStorage( std::dynamic_pointer_cast(outs.at(i).impl()) .get(), true_out); @@ -230,6 +230,95 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } } +static void RunInferShapeFunc(framework::InferShapeContext* ctx, + const paddle::InferShapeFunc& func, + const std::vector& inputs, + const std::vector& outputs, + const std::vector& attrs) { + std::vector> input_shapes; + std::vector>> vec_input_shapes; + + VLOG(3) << "Custom Operator: InferShape - get input ddim."; + for (auto& in_name : inputs) { + if (detail::IsDuplicableVar(in_name)) { + OP_INOUT_CHECK(ctx->HasInputs(in_name), "Input", in_name, "Custom"); + auto vec_ddim = ctx->GetInputsDim(in_name); + std::vector> vec_shape; + vec_shape.reserve(vec_ddim.size()); + std::transform(vec_ddim.begin(), vec_ddim.end(), + std::back_inserter(vec_shape), + [&](const DDim& ddim) -> std::vector { + return framework::vectorize(ddim); + }); + vec_input_shapes.emplace_back(vec_shape); + } else { + OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); + auto ddim = ctx->GetInputDim(in_name); + input_shapes.emplace_back(framework::vectorize(ddim)); + } + } + + std::vector custom_attrs; + for (auto& attr_str : attrs) { + auto attr_name_and_type = detail::ParseAttrStr(attr_str); + auto attr_name = attr_name_and_type[0]; + auto attr_type_str = attr_name_and_type[1]; + if (attr_type_str == "bool") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "float") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int64_t") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::string") { + 
custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back(ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + // NOTE(chenweihang): InferShape can't support std::vector + // attr type, because the input type is std::vector, only + // can use one rule to parse std::vector parameter + continue; + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported `%s` type value as custom attribute now. " + "Supported data types include `bool`, `int`, `float`, " + "`int64_t`, `std::string`, `std::vector`, " + "`std::vector`, `std::vector`, " + "Please check whether the attribute data type and " + "data type string are matched.", + attr_type_str)); + } + } + + VLOG(3) << "Custom Operator: InferShape - calc output ddim."; + auto output_shapes = func(input_shapes, vec_input_shapes, custom_attrs); + + VLOG(3) << "Custom Operator: InferShape - set output ddim."; + for (size_t i = 0; i < outputs.size(); ++i) { + auto out_name = outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + std::vector vec_ddim; + vec_ddim.reserve(output_shapes.size()); + std::transform(output_shapes.begin(), output_shapes.end(), + std::back_inserter(vec_ddim), + [&](const std::vector& shape) -> DDim { + return framework::make_ddim(shape); + }); + ctx->SetOutputsDim(out_name, vec_ddim); + } else { + ctx->SetOutputDim(out_name, framework::make_ddim(output_shapes[i])); + } + } +} + //////////////////// Operator Define ///////////////// class CustomOperator : public OperatorWithKernel { @@ -239,7 +328,7 @@ class CustomOperator : public OperatorWithKernel { // Dummy infershape // Because it is a pure virtual function, it must be implemented void InferShape(framework::InferShapeContext* ctx) const override { - VLOG(1) << "Custom Operator: Dummy infer shape of custom operator."; + VLOG(3) << "Custom Operator: Dummy infer shape of custom operator."; } /** @@ -381,7 +470,7 @@ class CustomGradOpMaker : public SingleGradOpMaker { auto fwd_op_outputs = this->OutputNames(); for (auto& in_name : inputs_) { - VLOG(1) << "Custom Operator: GradOpDescMaker - input: " << in_name; + VLOG(3) << "Custom Operator: GradOpDescMaker - input: " << in_name; if (!detail::IsGradVar(in_name)) { if (detail::IsMemberOf(fwd_op_inputs, in_name)) { grad_op->SetInput(in_name, this->Input(in_name)); @@ -398,7 +487,7 @@ class CustomGradOpMaker : public SingleGradOpMaker { } } for (auto& out_name : outputs_) { - VLOG(1) << "Custom Operator: GradOpDescMaker - output: " << out_name; + VLOG(3) << "Custom Operator: GradOpDescMaker - output: " << out_name; if (detail::IsDuplicableVar(out_name)) { grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name), @@ -447,7 +536,7 @@ class CustomGradOpMaker auto fwd_op_outputs = this->OutputNames(); for (auto& in_name : inputs_) { - VLOG(1) << "Custom Operator: GradOpBaseMaker - input: " << in_name; + VLOG(3) << "Custom Operator: GradOpBaseMaker - input: " << in_name; if (!detail::IsGradVar(in_name)) { if (detail::IsMemberOf(fwd_op_inputs, in_name)) { grad_op->SetInput(in_name, this->Input(in_name)); @@ -464,7 +553,7 @@ class CustomGradOpMaker } } for (auto& out_name : outputs_) { - VLOG(1) << "Custom Operator: GradOpBaseMaker - output: " << out_name; + VLOG(3) << "Custom Operator: 
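// NOTE: the attribute strings dispatched on above follow a "name: type"
// convention (see detail::ParseAttrStr and the VLOG at the top of this file).
// A standalone sketch of that rule with an illustrative helper name; it
// assumes a well-formed "name: type" input.
#include <string>
#include <utility>

std::pair<std::string, std::string> SplitAttrString(const std::string& attr) {
  auto trim = [](std::string s) {
    s.erase(0, s.find_first_not_of(' '));
    s.erase(s.find_last_not_of(' ') + 1);
    return s;
  };
  auto split_pos = attr.find(':');
  // e.g. "scale: float" -> {"scale", "float"}
  return {trim(attr.substr(0, split_pos)), trim(attr.substr(split_pos + 1))};
}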
GradOpBaseMaker - output: " << out_name; grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); } grad_op->SetAttrMap(this->Attrs()); @@ -486,11 +575,11 @@ void RegisterOperatorKernelWithPlace(const std::string& name, const std::vector& outputs, const std::vector& attrs) { OpKernelType key(type, experimental::ConvertExtPlaceToInnerPlace(place)); - VLOG(1) << "Custom Operator: op kernel key: " << key; + VLOG(3) << "Custom Operator: op kernel key: " << key; OperatorWithKernel::AllOpKernels()[name][key] = [kernel_func, inputs, outputs, attrs](const framework::ExecutionContext& ctx) { - VLOG(1) << "Custom Operator: run custom kernel func in lambda."; + VLOG(3) << "Custom Operator: run custom kernel func in lambda."; RunKernelFunc(ctx, kernel_func, inputs, outputs, attrs); }; } @@ -500,7 +589,7 @@ void RegisterOperatorKernel(const std::string& name, const std::vector& inputs, const std::vector& outputs, const std::vector& attrs) { - VLOG(1) << "Custom Operator: op name in kernel: " << name; + VLOG(3) << "Custom Operator: op name in kernel: " << name; // NOTE [ Dummy Op Kernel Key ] // TODO(chenweihang): Because execute engine need get device context based // op_kernel_key.place_, so we should register kernel for each @@ -535,12 +624,12 @@ void RegisterOperatorWithMetaInfo( auto& infer_shape_func = OpMetaInfoHelper::GetInferShapeFn(base_op_meta); auto& infer_dtype_func = OpMetaInfoHelper::GetInferDtypeFn(base_op_meta); - VLOG(1) << "Custom Operator: forward, op name: " << op_name; - VLOG(1) << "Custom Operator: forward, op inputs: " + VLOG(3) << "Custom Operator: forward, op name: " << op_name; + VLOG(3) << "Custom Operator: forward, op inputs: " << string::join_strings(op_inputs, ','); - VLOG(1) << "Custom Operator: forward, op outputs: " + VLOG(3) << "Custom Operator: forward, op outputs: " << string::join_strings(op_outputs, ','); - VLOG(1) << "Custom Operator: forward, op attrs: " + VLOG(3) << "Custom Operator: forward, op attrs: " << string::join_strings(op_attrs, ','); // Op @@ -588,96 +677,13 @@ void RegisterOperatorWithMetaInfo( "Please set the InferShapeFn of custom " "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); - VLOG(1) << "Custom Operator: Default InferShape - share ddim."; + VLOG(3) << "Custom Operator: Default InferShape - share ddim."; ctx->ShareDim(op_inputs[0], op_outputs[0]); }; } else { info.infer_shape_ = [op_inputs, op_outputs, op_attrs, infer_shape_func](InferShapeContext* ctx) { - std::vector> input_shapes; - std::vector>> vec_input_shapes; - - VLOG(1) << "Custom Operator: InferShape - get input ddim."; - for (auto& in_name : op_inputs) { - if (detail::IsDuplicableVar(in_name)) { - OP_INOUT_CHECK(ctx->HasInputs(in_name), "Input", in_name, "Custom"); - auto vec_ddim = ctx->GetInputsDim(in_name); - std::vector> vec_shape; - vec_shape.reserve(vec_ddim.size()); - std::transform(vec_ddim.begin(), vec_ddim.end(), - std::back_inserter(vec_shape), - [&](const DDim& ddim) -> std::vector { - return framework::vectorize(ddim); - }); - vec_input_shapes.emplace_back(vec_shape); - } else { - OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); - auto ddim = ctx->GetInputDim(in_name); - input_shapes.emplace_back(framework::vectorize(ddim)); - } - } - - std::vector custom_attrs; - for (auto& attr_str : op_attrs) { - auto attr_name_and_type = detail::ParseAttrStr(attr_str); - auto attr_name = attr_name_and_type[0]; - auto attr_type_str = attr_name_and_type[1]; - if (attr_type_str == "bool") { - 
custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "int") { - custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "float") { - custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "int64_t") { - custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "std::string") { - custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); - } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back( - ctx->Attrs().Get>(attr_name)); - } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back( - ctx->Attrs().Get>(attr_name)); - } else if (attr_type_str == "std::vector") { - // NOTE(chenweihang): InferShape can't support std::vector - // attr type, because the input type is std::vector, only - // can use one rule to parse std::vector parameter - continue; - } else if (attr_type_str == "std::vector") { - custom_attrs.emplace_back( - ctx->Attrs().Get>(attr_name)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported `%s` type value as custom attribute now. " - "Supported data types include `bool`, `int`, `float`, " - "`int64_t`, `std::string`, `std::vector`, " - "`std::vector`, `std::vector`, " - "Please check whether the attribute data type and " - "data type string are matched.", - attr_type_str)); - } - } - - VLOG(1) << "Custom Operator: InferShape - calc output ddim."; - auto output_shapes = - infer_shape_func(input_shapes, vec_input_shapes, custom_attrs); - - VLOG(1) << "Custom Operator: InferShape - set output ddim."; - for (size_t i = 0; i < op_outputs.size(); ++i) { - auto out_name = op_outputs[i]; - if (detail::IsDuplicableVar(out_name)) { - std::vector vec_ddim; - vec_ddim.reserve(output_shapes.size()); - std::transform(output_shapes.begin(), output_shapes.end(), - std::back_inserter(vec_ddim), - [&](const std::vector& shape) -> DDim { - return framework::make_ddim(shape); - }); - ctx->SetOutputsDim(out_name, vec_ddim); - } else { - ctx->SetOutputDim(out_name, framework::make_ddim(output_shapes[i])); - } - } + RunInferShapeFunc(ctx, infer_shape_func, op_inputs, op_outputs, op_attrs); }; } @@ -706,7 +712,7 @@ void RegisterOperatorWithMetaInfo( "Please set the InferDtypeFn of custom " "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); - VLOG(1) << "Custom Operator: InferDtype - share dtype."; + VLOG(3) << "Custom Operator: InferDtype - share dtype."; auto dtype = ctx->GetInputDataType(op_inputs[0]); ctx->SetOutputDataType(op_outputs[0], dtype); }; @@ -716,7 +722,7 @@ void RegisterOperatorWithMetaInfo( std::vector input_dtypes; std::vector> vec_input_dtypes; - VLOG(1) << "Custom Operator: InferDtype - get input dtype."; + VLOG(3) << "Custom Operator: InferDtype - get input dtype."; for (auto& in_name : op_inputs) { if (detail::IsDuplicableVar(in_name)) { std::vector vec_custom_dtype; @@ -731,10 +737,10 @@ void RegisterOperatorWithMetaInfo( } } - VLOG(1) << "Custom Operator: InferDtype - infer output dtype."; + VLOG(3) << "Custom Operator: InferDtype - infer output dtype."; auto output_dtypes = infer_dtype_func(input_dtypes, vec_input_dtypes); - VLOG(1) << "Custom Operator: InferDtype - set output dtype."; + VLOG(3) << "Custom Operator: InferDtype - set output dtype."; for (size_t i = 0; i < op_outputs.size(); ++i) { auto out_name = op_outputs[i]; if (detail::IsDuplicableVar(out_name)) { @@ -763,11 +769,12 @@ void RegisterOperatorWithMetaInfo( auto& grad_op_outputs = OpMetaInfoHelper::GetOutputs(cur_grad_op); auto& 
grad_op_attrs = OpMetaInfoHelper::GetAttrs(cur_grad_op); auto& grad_kernel_fn = OpMetaInfoHelper::GetKernelFn(cur_grad_op); + auto& grad_infer_shape_fn = OpMetaInfoHelper::GetInferShapeFn(cur_grad_op); - VLOG(1) << "Custom Operator: backward, op name: " << grad_op_name; - VLOG(1) << "Custom Operator: backward, op inputs: " + VLOG(3) << "Custom Operator: backward, op name: " << grad_op_name; + VLOG(3) << "Custom Operator: backward, op inputs: " << string::join_strings(grad_op_inputs, ','); - VLOG(1) << "Custom Operator: backward, op outputs: " + VLOG(3) << "Custom Operator: backward, op outputs: " << string::join_strings(grad_op_outputs, ','); // GradOpDescMaker @@ -809,40 +816,52 @@ void RegisterOperatorWithMetaInfo( }; // Grad InferShape - grad_info.infer_shape_ = [grad_op_inputs, - grad_op_outputs](InferShapeContext* ctx) { - // 1. if forward input exists, gradient's shape is same with forward input - // default - // [Suitable for most situations] - // 2. if forward input not exists, and only contains one grad input and - // output, - // use grad input shape as grad output shape - // [Suitable for the situation that forward input is not used as - // backward input] - // TODO(chenweihang): support set grad op infershape func if needed - for (auto& out_name : grad_op_outputs) { - auto fwd_name = detail::NoGrad(out_name); - if (detail::IsDuplicableVar(fwd_name)) { - // Duplicable forward var must as backward input - ctx->ShareDim(fwd_name, out_name); - } else { - if (ctx->HasInput(fwd_name)) { + if (grad_infer_shape_fn == nullptr) { + grad_info.infer_shape_ = [grad_op_inputs, + grad_op_outputs](InferShapeContext* ctx) { + // 1. if forward input exists, gradient's shape is same with forward + // input + // default + // [Suitable for most situations] + // 2. if forward input not exists, and only contains one grad input and + // output, + // use grad input shape as grad output shape + // [Suitable for the situation that forward input is not used as + // backward input] + for (auto& out_name : grad_op_outputs) { + auto fwd_name = detail::NoGrad(out_name); + if (detail::IsDuplicableVar(fwd_name)) { + // Duplicable forward var must as backward input ctx->ShareDim(fwd_name, out_name); } else { - PADDLE_ENFORCE_EQ( - grad_op_inputs.size() == 1UL && grad_op_outputs.size() == 1UL, - true, - platform::errors::Unavailable( - "Custom grad operator infershape error. " - "If a custom grad operator contains only one input and " - "only one output, the input shape will be directly set to " - "the output shape. Otherwise, Please set the forward input " - "as the grad operator's input.")); - ctx->ShareDim(grad_op_inputs[0], out_name); + if (ctx->HasInput(fwd_name)) { + ctx->ShareDim(fwd_name, out_name); + } else { + PADDLE_ENFORCE_EQ( + grad_op_inputs.size() == 1UL && grad_op_outputs.size() == 1UL, + true, + platform::errors::Unavailable( + "Custom grad operator infershape error. " + "If a custom grad operator contains only one input and " + "only one output, the input shape will be directly set " + "to " + "the output shape. 
Otherwise, Please set the forward " + "input " + "as the grad operator's input or set the InferShapeFn " + "of custom grad operator by " + ".SetInferShapeFn(PD_INFER_SHAPE(...))")); + ctx->ShareDim(grad_op_inputs[0], out_name); + } } } - } - }; + }; + } else { + grad_info.infer_shape_ = [grad_op_inputs, grad_op_outputs, grad_op_attrs, + grad_infer_shape_fn](InferShapeContext* ctx) { + RunInferShapeFunc(ctx, grad_infer_shape_fn, grad_op_inputs, + grad_op_outputs, grad_op_attrs); + }; + } // Kernel func RegisterOperatorKernel(grad_op_name, grad_kernel_fn, grad_op_inputs, @@ -860,11 +879,11 @@ void RegisterOperatorWithMetaInfo( void RegisterOperatorWithMetaInfoMap( const paddle::OpMetaInfoMap& op_meta_info_map) { auto& meta_info_map = op_meta_info_map.GetMap(); - VLOG(1) << "Custom Operator: size of op meta info map - " + VLOG(3) << "Custom Operator: size of op meta info map - " << meta_info_map.size(); // pair: {op_type, OpMetaInfo} for (auto& pair : meta_info_map) { - VLOG(1) << "Custom Operator: pair first -> op name: " << pair.first; + VLOG(3) << "Custom Operator: pair first -> op name: " << pair.first; RegisterOperatorWithMetaInfo(pair.second); } } @@ -874,7 +893,7 @@ void RegisterOperatorWithMetaInfoMap( // load op api void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); - VLOG(1) << "load custom_op lib: " << dso_name; + VLOG(3) << "load custom_op lib: " << dso_name; typedef OpMetaInfoMap& get_op_meta_info_map_t(); auto* get_op_meta_info_map = detail::DynLoad(handle, "PD_GetOpMetaInfoMap"); diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 4e5be2e53503f..a81e4abd45e56 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -23,6 +23,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/include/core.h" + namespace paddle { namespace framework { @@ -73,9 +76,12 @@ class TestKernel : public OpKernel { output->Resize(input->dims()); output->mutable_data(ctx.GetPlace()); - operators::TransformFunctor, T, DeviceContext> functor( - input, input, output, ctx.template device_context(), - AddFunctor()); + auto pt_input = paddle::experimental::MakePtenDenseTensor(*input); + auto pt_out = paddle::experimental::MakePtenDenseTensor(*output); + + pten::funcs::TransformFunctor, T, DeviceContext> functor( + *pt_input, *pt_input, pt_out.get(), + ctx.template device_context(), AddFunctor()); functor.Run(); } }; diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h index 947f06408d028..5a47b0f655f4f 100644 --- a/paddle/fluid/framework/data_layout.h +++ b/paddle/fluid/framework/data_layout.h @@ -18,58 +18,4 @@ limitations under the License. 
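// NOTE: a hedged sketch of the adaptation pattern used in the test hunk above:
// wrap fluid Tensors as pten DenseTensors, then hand them to the pten functor.
// The helper name and the generic BinaryFunctor parameter are illustrative.
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/pten/include/core.h"

template <typename DeviceContext, typename T, typename BinaryFunctor>
void RunPtenTransform(const paddle::framework::ExecutionContext& ctx,
                      const paddle::framework::Tensor& x,
                      paddle::framework::Tensor* out, BinaryFunctor f) {
  auto pt_x = paddle::experimental::MakePtenDenseTensor(x);
  auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
  // Elementwise f(x, x) -> out, computed entirely on pten DenseTensor.
  pten::funcs::TransformFunctor<BinaryFunctor, T, DeviceContext> functor(
      *pt_x, *pt_x, pt_out.get(),
      ctx.template device_context<DeviceContext>(), f);
  functor.Run();
}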
*/ #include #include -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -enum class DataLayout { - kNHWC = 0, - kNCHW = 1, - kAnyLayout = 2, - kMKLDNN = 3, // all layouts supported by MKLDNN internally -}; - -inline DataLayout StringToDataLayout(const std::string& str) { - std::string s(str); - for (size_t i = 0; i < s.size(); ++i) { - s[i] = toupper(s[i]); - } - - if (s == "NHWC") { - return DataLayout::kNHWC; - } else if (s == "NCHW") { - return DataLayout::kNCHW; - } else if (s == "ANYLAYOUT") { - return DataLayout::kAnyLayout; - } else if (s == "MKLDNNLAYOUT") { - return DataLayout::kMKLDNN; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown data layout type string: %s.", s)); - } -} - -inline std::string DataLayoutToString(const DataLayout& data_layout) { - switch (data_layout) { - case DataLayout::kNHWC: - return "NHWC"; - case DataLayout::kNCHW: - return "NCHW"; - case DataLayout::kAnyLayout: - return "ANY_LAYOUT"; - case DataLayout::kMKLDNN: - return "MKLDNNLAYOUT"; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown Data Layout type %d.", data_layout)); - } -} - -inline std::ostream& operator<<(std::ostream& out, const DataLayout& l) { - out << DataLayoutToString(l); - return out; -} - -} // namespace framework -} // namespace paddle +#include "paddle/pten/common/layout.h" diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 70693a5df2609..d8c372becf1b4 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -43,9 +43,8 @@ void TransformData(const OpKernelType &expected_kernel_type, Tensor in; in.ShareDataWith(input_tensor); Tensor out; - DataLayout lin = kernel_type_for_var.data_layout_; - DataLayout lout = expected_kernel_type.data_layout_; - + const DataLayout lin = kernel_type_for_var.data_layout_; + const DataLayout lout = expected_kernel_type.data_layout_; // do layout transform if (NeedTransformLayout(lout, lin)) { #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 08749b6b7515b..ec8284b825500 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -89,6 +89,22 @@ struct DataTypeTrait { _ForEachDataTypeHelper_(callback, int, INT32); \ _ForEachDataTypeHelper_(callback, int64_t, INT64); +// It's only for DataParallel in HIP, bf16 not support in HIP. 
+#define _ForEachDataTypeForHIP_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX128); + #define DefineDataTypeTrait(cpp_type, proto_type) \ template <> \ struct DataTypeTrait { \ @@ -147,6 +163,20 @@ inline void VisitDataTypeTiny(proto::VarType::Type type, Visitor visitor) { #undef VisitDataTypeCallbackTiny } +template +inline void VisitDataTypeForHIP(proto::VarType::Type type, Visitor visitor) { +#define VisitDataTypeCallbackHIP(cpp_type, proto_type) \ + do { \ + if (type == proto_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _ForEachDataTypeForHIP_(VisitDataTypeCallbackHIP); +#undef VisitDataTypeCallbackHIP +} + extern std::string DataTypeToString(const proto::VarType::Type type); extern size_t SizeOfType(proto::VarType::Type type); inline std::ostream& operator<<(std::ostream& out, diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index b1573093ec333..f93202769dbd0 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -153,7 +153,7 @@ void AllReduceOpHandle::AllReduceImpl( "The place type of tensors of the same variable " "in different local scopes should be equal.")); - lod_tensor_data.emplace_back(lod_tensor.data()); + lod_tensor_data.emplace_back(lod_tensor.data()); places.emplace_back(lod_tensor.place()); VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name() @@ -225,7 +225,7 @@ void AllReduceOpHandle::AllReduceFunc( ->GetMutable(); // Reduce All Tensor to trg in CPU - ReduceBufferData func(lod_tensor_data, trg.data(), numel); + ReduceBufferData func(lod_tensor_data, trg.data(), numel); VisitDataType(trg.type(), func); for (size_t i = 1; i < local_exec_scopes_.size(); ++i) { @@ -235,9 +235,9 @@ void AllReduceOpHandle::AllReduceFunc( size_t size = numel * SizeOfType(trg.type()); RunAndRecordEvent(p, [&trg, var, p, size] { - auto dst_ptr = var->GetMutable()->data(); + auto dst_ptr = var->GetMutable()->data(); platform::CPUPlace cpu_place; - memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data(), size); + memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data(), size); }); } } diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index a11a244214d4f..01dc5a45146f1 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -101,7 +101,7 @@ void BroadcastOpHandle::BroadcastOneVar( void *send_recv_buffer = nullptr; if (root_id == dst_id) { - send_recv_buffer = const_cast(in_tensor.data()); + send_recv_buffer = const_cast(in_tensor.data()); out_handle = out_var_handle; } else { send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) @@ -162,7 +162,7 @@ void BroadcastOpHandle::BroadcastOneVar( void *send_recv_buffer = nullptr; if (root_id 
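// NOTE: a sketch of the visitor protocol these _ForEachDataType_ helpers
// expect — a functor exposing a templated apply<T>() that the Visit* function
// instantiates for the cpp type matching the runtime proto type. The visitor
// and wrapper names below are illustrative.
#include "paddle/fluid/framework/data_type.h"

struct ElementSizeVisitor {
  size_t* out;
  template <typename T>
  void apply() const {
    *out = sizeof(T);  // instantiated once per entry of the macro's type list
  }
};

inline size_t ElementSizeForHIP(paddle::framework::proto::VarType::Type type) {
  size_t size = 0;
  ElementSizeVisitor visitor{&size};
  paddle::framework::VisitDataTypeForHIP(type, visitor);
  return size;
}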
== dst_id) { - send_recv_buffer = const_cast(in_tensor.data()); + send_recv_buffer = const_cast(in_tensor.data()); out_handle = out_var_handle; } else { send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index bd153f24fa318..b65d4e4fcd55a 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -220,17 +220,17 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( g_tensor.begin(), g_tensor.end(), [](const std::pair &grad1, const std::pair &grad2) -> bool { - return grad1.second->data() < grad2.second->data(); + return grad1.second->data() < grad2.second->data(); }); size_t size_of_dtype = framework::SizeOfType(dtype); for (size_t k = 1; k < g_tensor.size(); ++k) { - const void *cur_address = g_tensor.at(k - 1).second->data(); + const void *cur_address = g_tensor.at(k - 1).second->data(); int64_t len = g_tensor.at(k - 1).second->numel(); auto offset = platform::Alignment(len * size_of_dtype, places_[0]); void *infer_next_address = reinterpret_cast( reinterpret_cast(cur_address) + offset); - const void *next_address = g_tensor.at(k).second->data(); + const void *next_address = g_tensor.at(k).second->data(); VLOG(10) << string::Sprintf( "Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer " @@ -267,7 +267,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( std::vector lod_tensor_data; lod_tensor_data.reserve(place_num); for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { - auto data = grads_tensor.at(scope_idx).at(0).second->data(); + auto data = grads_tensor.at(scope_idx).at(0).second->data(); lod_tensor_data.emplace_back(data); } std::vector grad_var_names; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 25b5eefc05cda..fe21a62efd087 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -110,9 +110,7 @@ void OpHandleBase::InitXPU() { "%s should have only one dev_ctx.", Name())); auto &place = dev_ctxes_.begin()->first; int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; - PADDLE_ENFORCE_EQ( - xpu_set_device(dev_id), XPU_SUCCESS, - platform::errors::PreconditionNotMet("xpu_set_device failed")); + platform::SetXPUDeviceId(dev_id); for (auto &out_var : outputs_) { auto *out_var_handle = dynamic_cast(out_var); if (out_var_handle) { diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index bbc458804a195..196f7a3d4a4bf 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -159,7 +159,7 @@ void ReduceOpHandle::RunImpl() { VisitDataType(lod_tensors[0]->type(), func); auto trg = out_var->GetMutable(); - if (reduce_sum_trg.data() != trg->data()) { + if (reduce_sum_trg.data() != trg->data()) { TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg); } } @@ -181,7 +181,7 @@ void ReduceOpHandle::RunImpl() { int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device; auto &nccl_ctx = nccl_ctxs_->at(dev_id); - void *buffer = const_cast(lod_tensor.data()); + void *buffer = const_cast(lod_tensor.data()); void *recvbuffer = nullptr; if (root_id == dev_id) { recvbuffer = @@ -227,7 +227,7 @@ void ReduceOpHandle::RunImpl() { int dev_id = BOOST_GET_CONST(platform::XPUPlace, 
p).device; auto &bkcl_ctx = bkcl_ctxs_->at(dev_id); - void *buffer = const_cast(lod_tensor.data()); + void *buffer = const_cast(lod_tensor.data()); void *recvbuffer = nullptr; if (root_id == dev_id) { recvbuffer = diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index d916b9bc26276..ed485ed587c0b 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -146,7 +146,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &place = places_[i]; auto &in = *ins[i]; - void *in_tensor_buf = const_cast(in.data()); + void *in_tensor_buf = const_cast(in.data()); auto &out = *outs[i]; float *out_tensor_buf = out.data(); @@ -175,7 +175,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { // dgc use ncclAllGather to get all the encoded data // so the buffer need nranks. int buf_size = nranks_ * encode_size; - void *gather_buff = gathers[i]->data(); + void *gather_buff = gathers[i]->data(); VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 7380e0f129cf4..28108e78d9d99 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -45,6 +45,7 @@ message ShardingConfig { optional bool optimize_cast = 12 [ default = false ]; // Optimizer sharding. Temporary plans and may be deprecated optional bool _dp_as_optimizer_sharding = 13 [ default = false ]; + optional int32 stage = 14 [ default = 1 ]; } message HybridConfig { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index cef1016aa5340..95913664961b3 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -134,7 +134,7 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { // init data, data buffer - t_.data = const_cast(tensor.data()); + t_.data = const_cast(tensor.data()); // init device, DLDevice type with device_type and device_id auto place = tensor.place(); diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index a6abda8a83bc8..2970c52db1a48 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -57,7 +57,7 @@ struct EigenTensor { } static Type From(Tensor& tensor) { // NOLINT - return From(tensor, tensor.dims_); + return From(tensor, tensor.dims()); } // NOLINT static ConstType From(const Tensor& tensor, DDim dims) { @@ -65,7 +65,7 @@ struct EigenTensor { } static ConstType From(const Tensor& tensor) { - return From(tensor, tensor.dims_); + return From(tensor, tensor.dims()); } }; @@ -74,7 +74,7 @@ template { static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT int num_col_dims) { - int rank = tensor.dims_.size(); + int rank = tensor.dims().size(); PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, platform::errors::InvalidArgument( "Input dimension number(num_col_dims) must be " @@ -86,7 +86,7 @@ struct EigenMatrix : public EigenTensor { static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, int num_col_dims) { - int rank = tensor.dims_.size(); + int rank = tensor.dims().size(); 
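// NOTE: a sketch of how the EigenTensor/EigenVector wrappers touched above are
// typically used inside an op kernel; the doubling computation and names are
// illustrative. The wrappers now reach the shape through the public dims()
// accessor rather than the private dims_ member.
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"

template <typename DeviceContext, typename T>
void DoubleElementsSketch(const paddle::framework::ExecutionContext& ctx,
                          const paddle::framework::Tensor& x,
                          paddle::framework::Tensor* out) {
  out->mutable_data<T>(ctx.GetPlace());
  auto x_e = paddle::framework::EigenVector<T>::Flatten(x);
  auto out_e = paddle::framework::EigenVector<T>::Flatten(*out);
  auto& place =
      *ctx.template device_context<DeviceContext>().eigen_device();
  out_e.device(place) = x_e * static_cast<T>(2);  // flattened elementwise op
}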
PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, platform::errors::InvalidArgument( "Input dimension number(num_col_dims) must be " @@ -102,12 +102,12 @@ template { // Flatten reshapes a Tensor into an EigenVector. static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT - return EigenVector::From(tensor, {product(tensor.dims_)}); + return EigenVector::From(tensor, {product(tensor.dims())}); } static typename EigenVector::ConstType Flatten( const Tensor& tensor) { // NOLINT - return EigenVector::From(tensor, {product(tensor.dims_)}); + return EigenVector::From(tensor, {product(tensor.dims())}); } }; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 36c5b13701361..19d71076542bc 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -4,7 +4,7 @@ if(WITH_PSLIB) else() set(BRPC_DEPS brpc) endif(WITH_PSLIB_BRPC) - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope ${BRPC_DEPS} pslib) + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto proto_desc op_registry variable_helper scope ${BRPC_DEPS} pslib) else() cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) endif(WITH_PSLIB) diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index 82ce3b28776f1..4127adf1bfe27 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -150,8 +150,7 @@ class AscendInstance { VarTypeToGeType(tensor->type())); tensor_desc.SetRealDimCnt(vec_dim.size()); - const uint8_t *data = - reinterpret_cast(tensor->data()); + const uint8_t *data = reinterpret_cast(tensor->data()); std::vector dst(numel * GeTypeSize(tensor->type())); memcpy(dst.data(), data, GeTypeSize(tensor->type()) * numel); ge::Tensor ge_tensor(tensor_desc, dst); diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 68868f447b5c3..45f9b04383944 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -39,22 +39,45 @@ namespace framework { class HeterContext { public: ~HeterContext() { - for (size_t i = 0; i < mutex_.size(); ++i) { - delete mutex_[i]; + if (!multi_mf_dim_) { + for (size_t i = 0; i < mutex_.size(); ++i) { + delete mutex_[i]; + } + mutex_.clear(); + } else { + for (size_t i = 0; i < dim_mutex_.size(); ++i) { + for (size_t j = 0; j < dim_mutex_[i].size(); j++) { + delete dim_mutex_[i][j]; + } + dim_mutex_[i].clear(); + } } - mutex_.clear(); } Scope* scope_{nullptr}; std::vector> feature_keys_; + std::vector>> feature_dim_keys_; + #ifdef PADDLE_WITH_PSLIB std::vector> value_ptr_; + std::vector>> + value_dim_ptr_; + std::vector>> + device_dim_ptr_; #endif #ifdef PADDLE_WITH_PSCORE std::vector> value_ptr_; + std::vector>> + value_dim_ptr_; + std::vector>> + device_dim_ptr_; #endif std::vector> device_values_; std::vector> device_keys_; + std::vector>> device_dim_keys_; + std::vector>> device_dim_values_; std::vector mutex_; + std::vector> dim_mutex_; + int multi_mf_dim_ = 0; uint32_t shard_num_ = 37; uint64_t size() { @@ -79,18 +102,78 @@ class HeterContext { } } - void Reset() { - for (size_t i = 0; i < feature_keys_.size(); ++i) { - feature_keys_[i].clear(); + void init(int shard_num, int device_num, int dim_num) { + shard_num_ = shard_num; + feature_keys_.resize(shard_num_); + 
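// NOTE: a standalone sketch of the [shard][mf_dim] layout that init() above
// builds for the multi-mf-dim path; 37 shards and 2 embedding dims are just
// example numbers.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int shard_num = 37;
  const int dim_num = 2;  // e.g. mf dims {8, 64} -> two buckets per shard
  std::vector<std::vector<std::vector<uint64_t>>> feature_dim_keys(shard_num);
  for (auto& per_shard : feature_dim_keys) {
    per_shard.resize(dim_num);  // one key bucket per (shard, dim) pair
  }
  feature_dim_keys[3][1].push_back(12345);  // key 12345 in shard 3, dim slot 1
  std::cout << feature_dim_keys[3][1].size() << std::endl;  // prints 1
  return 0;
}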
feature_dim_keys_.resize(shard_num_); + value_ptr_.resize(shard_num_); + value_dim_ptr_.resize(shard_num_); + for (size_t i = 0; i < feature_dim_keys_.size(); i++) { + feature_dim_keys_[i].resize(dim_num); + value_dim_ptr_[i].resize(dim_num); + if (i == 0) { + for (int j = 0; j < dim_num; j++) { + feature_dim_keys_[i][j].push_back(0); + } + } } - for (size_t i = 0; i < value_ptr_.size(); ++i) { - value_ptr_[i].clear(); + device_values_.resize(device_num); + device_dim_values_.resize(device_num); + device_keys_.resize(device_num); + + device_dim_keys_.resize(device_num); + device_dim_ptr_.resize(device_num); + mutex_.resize(device_num); + dim_mutex_.resize(device_num); + for (size_t i = 0; i < mutex_.size(); ++i) { + mutex_[i] = new std::mutex(); } - for (size_t i = 0; i < device_values_.size(); ++i) { - device_values_[i].clear(); + for (size_t i = 0; i < dim_mutex_.size(); ++i) { + dim_mutex_[i].resize(dim_num); + for (int j = 0; j < dim_num; j++) { + dim_mutex_[i][j] = new std::mutex(); + } } - for (size_t i = 0; i < device_keys_.size(); ++i) { - device_keys_[i].clear(); + multi_mf_dim_ = dim_num; + } + + void Reset() { + if (!multi_mf_dim_) { + for (size_t i = 0; i < feature_keys_.size(); ++i) { + feature_keys_[i].clear(); + } + for (size_t i = 0; i < value_ptr_.size(); ++i) { + value_ptr_[i].clear(); + } + for (size_t i = 0; i < device_values_.size(); ++i) { + device_values_[i].clear(); + } + for (size_t i = 0; i < device_keys_.size(); ++i) { + device_keys_[i].clear(); + } + } else { + VLOG(3) << "Reset gpu task with dynamic mf dimention"; + for (size_t i = 0; i < feature_dim_keys_.size(); i++) { + for (size_t j = 0; j < feature_dim_keys_[i].size(); j++) { + feature_dim_keys_[i][j].clear(); + } + } + for (size_t i = 0; i < value_dim_ptr_.size(); i++) { + for (size_t j = 0; j < value_dim_ptr_[i].size(); j++) { + value_dim_ptr_[i][j].clear(); + } + } + + for (size_t i = 0; i < device_dim_keys_.size(); i++) { + for (size_t j = 0; j < device_dim_keys_[i].size(); j++) { + device_dim_keys_[i][j].clear(); + } + } + for (size_t i = 0; i < device_dim_ptr_.size(); i++) { + for (size_t j = 0; j < device_dim_ptr_[i].size(); j++) { + device_dim_ptr_[i][j].clear(); + } + } } } void batch_add_keys( @@ -115,6 +198,15 @@ class HeterContext { feature_keys_[shard_num].begin() + idx); } + void batch_add_keys(int shard_num, int dim_id, + const robin_hood::unordered_set& shard_keys) { + int idx = feature_dim_keys_[shard_num][dim_id].size(); + feature_dim_keys_[shard_num][dim_id].resize( + feature_dim_keys_[shard_num][dim_id].size() + shard_keys.size()); + std::copy(shard_keys.begin(), shard_keys.end(), + feature_dim_keys_[shard_num][dim_id].begin() + idx); + } + void UniqueKeys() { std::vector threads; auto unique_func = [this](int i) { @@ -124,9 +216,26 @@ class HeterContext { it = std::unique(cur_keys.begin(), cur_keys.end()); cur_keys.resize(std::distance(cur_keys.begin(), it)); }; - for (uint32_t i = 0; i < shard_num_; i++) { - threads.push_back(std::thread(unique_func, i)); + auto unique_dynamic_mf_func = [this](int i, int j) { + auto& cur_keys = feature_dim_keys_[i][j]; + std::sort(cur_keys.begin(), cur_keys.end()); + std::vector::iterator it; + it = std::unique(cur_keys.begin(), cur_keys.end()); + cur_keys.resize(std::distance(cur_keys.begin(), it)); + }; + if (!multi_mf_dim_) { + for (uint32_t i = 0; i < shard_num_; i++) { + threads.push_back(std::thread(unique_func, i)); + } + } else { + for (uint32_t i = 0; i < shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + 
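// NOTE: a standalone sketch of the sort-then-unique dedup used by both
// unique_func and unique_dynamic_mf_func above; the key values are arbitrary.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint64_t> keys = {42, 7, 42, 13, 7};
  std::sort(keys.begin(), keys.end());
  auto it = std::unique(keys.begin(), keys.end());
  keys.resize(std::distance(keys.begin(), it));
  std::cout << keys.size() << std::endl;  // prints 3 (7, 13, 42)
  return 0;
}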
threads.push_back(std::thread(unique_dynamic_mf_func, i, j)); + } + } + VLOG(3) << "heter_context unique keys with dynamic mf dimention"; } + for (std::thread& t : threads) { t.join(); } diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 939b5e3099a62..189724a545520 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -7,7 +7,7 @@ IF(WITH_GPU) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) endif() - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS ${HETERPS_DEPS}) + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) ENDIF() diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index e7f098320c6c7..509b43431b572 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -27,6 +27,8 @@ limitations under the License. */ #include "thrust/pair.h" // #include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/platform/device/gpu/gpu_types.h" @@ -53,8 +55,11 @@ class HashTable { HashTable& operator=(const HashTable&) = delete; void insert(const KeyType* d_keys, const ValType* d_vals, size_t len, gpuStream_t stream); + void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index, + gpuStream_t stream); void get(const KeyType* d_keys, ValType* d_vals, size_t len, gpuStream_t stream); + void get(const KeyType* d_keys, char* d_vals, size_t len, gpuStream_t stream); void show(); void dump_to_cpu(int devid, cudaStream_t stream); @@ -62,8 +67,20 @@ class HashTable { void update(const KeyType* d_keys, const GradType* d_grads, size_t len, Sgd sgd, gpuStream_t stream); + template + void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd, + gpuStream_t stream); + int size() { return container_->size(); } + void set_feature_value_size(size_t pull_feature_value_size, + size_t push_grad_value_size) { + pull_feature_value_size_ = pull_feature_value_size; + push_grad_value_size_ = push_grad_value_size; + VLOG(3) << "hashtable set pull value size: " << pull_feature_value_size_ + << " push value size: " << push_grad_value_size_; + } + std::unique_ptr rwlock_{nullptr}; private: @@ -71,6 +88,9 @@ class HashTable { int BLOCK_SIZE_{256}; float LOAD_FACTOR{0.75f}; size_t capacity_; + size_t max_mf_dim_ = 8; + size_t pull_feature_value_size_; + size_t push_grad_value_size_; }; } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 9f3d1a7adcafc..dec7357468558 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -42,6 +42,23 @@ __global__ void insert_kernel(Table* table, } } +template +__global__ void insert_kernel(Table* table, + const typename Table::key_type* const 
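// NOTE: a standalone sketch of the flat byte-pool addressing behind the new
// pool-based insert/get paths above: values live in fixed-size slots of one
// contiguous allocation, and a slot index maps to mem + index * block_size.
// The capacity/block-size numbers are illustrative.
#include <cstddef>
#include <cstdlib>
#include <iostream>

int main() {
  const size_t capacity = 1024;  // number of feature-value slots
  const size_t block_size = 80;  // bytes per slot (aligned value size)
  char* mem = static_cast<char*>(std::malloc(capacity * block_size));
  auto slot_address = [&](size_t idx) -> void* {
    return mem + idx * block_size;
  };
  std::cout << (static_cast<char*>(slot_address(3)) - mem) << std::endl;  // 240
  std::free(mem);
  return 0;
}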
keys, + size_t len, char* pool, int start_index) { + ReplaceOp op; + thrust::pair kv; + + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < len) { + kv.first = keys[i]; + kv.second = (Table::mapped_type)(pool + (start_index + i) * 80); + auto it = table->insert(kv, op); + assert(it != table->end() && "error: insert fails: table is full"); + } +} + template __global__ void search_kernel(Table* table, const typename Table::key_type* const keys, @@ -56,6 +73,20 @@ __global__ void search_kernel(Table* table, } } +template +__global__ void dy_mf_search_kernel(Table* table, + const typename Table::key_type* const keys, + char* const vals, size_t len, + size_t pull_feature_value_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto it = table->find(keys[i]); + + if (it != table->end()) { + *(FeatureValue*)(vals + i * pull_feature_value_size) = *(it->second); + } + } +} template __global__ void update_kernel(Table* table, const typename Table::key_type* const keys, @@ -70,6 +101,23 @@ __global__ void update_kernel(Table* table, } } +template +__global__ void dy_mf_update_kernel(Table* table, + const typename Table::key_type* const keys, + const char* const grads, size_t len, + Sgd sgd, size_t grad_value_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto it = table->find(keys[i]); + if (it != table->end()) { + FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); + sgd.dy_mf_update_value((it.getter())->second, *cur); + } else { + printf("yxf::push miss key: %d", keys[i]); + } + } +} + template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); @@ -97,6 +145,17 @@ void HashTable::get(const KeyType* d_keys, ValType* d_vals, d_vals, len); } +template +void HashTable::get(const KeyType* d_keys, char* d_vals, + size_t len, gpuStream_t stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + dy_mf_search_kernel<<>>( + container_, d_keys, d_vals, len, pull_feature_value_size_); +} + template void HashTable::insert(const KeyType* d_keys, const ValType* d_vals, size_t len, @@ -109,6 +168,21 @@ void HashTable::insert(const KeyType* d_keys, d_vals, len); } +template +void HashTable::insert(const KeyType* d_keys, size_t len, + char* pool, size_t start_index, + gpuStream_t stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + if (pool == NULL) { + return; + } + insert_kernel<<>>(container_, d_keys, len, + pool, start_index); +} + template void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { container_->prefetch(cudaCpuDeviceId, stream); @@ -166,6 +240,20 @@ void HashTable::update(const KeyType* d_keys, d_grads, len, sgd); } +template +template +void HashTable::update(const KeyType* d_keys, + const char* d_grads, size_t len, + Sgd sgd, gpuStream_t stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + + dy_mf_update_kernel<<>>( + container_, d_keys, d_grads, len, sgd, push_grad_value_size_); +} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index ccdb6c5cdd64e..cad7559af5742 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -104,6 +104,13 @@ int HeterPsResource::get_index_by_devid(int devid) { int 
HeterPsResource::total_gpu() { return dev_ids_.size(); } +void HeterPsResource::set_multi_mf(int multi_mf_dim, int max_mf_dim) { + multi_mf_dim_ = multi_mf_dim; + max_mf_dim_ = max_mf_dim; + VLOG(3) << "heter resource set mf dim: " << multi_mf_dim_ + << " max_mf_dim_: " << max_mf_dim_; +} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 7bc52e52e6887..19df8cc70f50e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -56,6 +56,7 @@ class HeterPsResource { int total_gpu(); int get_index_by_devid(int devid); int dev_id(int num); + void set_multi_mf(int multi_mf_dim, int max_mf_dim); gpuStream_t local_stream(int gpu_num, int stream_num); gpuStream_t remote_stream(int gpu_num, int stream_num); gpuStream_t comm_stream(int gpu_num, int stream_num); @@ -63,6 +64,8 @@ class HeterPsResource { std::vector> resources_; std::vector dev_ids_; std::map devid_2_index_; + int multi_mf_dim_{0}; + int max_mf_dim_{0}; }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h new file mode 100644 index 0000000000000..9189902c28ffb --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_HETERPS +// #include +// "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" +#include +#include "paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh" + +namespace paddle { +namespace framework { + +class MemoryPool { + public: + MemoryPool(size_t capacity, size_t block_size) + : capacity_(capacity), block_size_(block_size) { + VLOG(3) << "mem_pool init with block_size: " << block_size + << " capacity: " << capacity; + mem_ = (char*)malloc(block_size * capacity_); + } + ~MemoryPool() { + VLOG(3) << "mem pool delete"; + free(mem_); + } + size_t block_size() { return block_size_; } + char* mem() { return mem_; } + + size_t capacity() { return capacity_; } + size_t byte_size() { return capacity_ * block_size_; } + void* mem_address(const uint32_t& idx) { + return (void*)&mem_[(idx)*block_size_]; + } + + private: + char* mem_ = NULL; + size_t capacity_; + size_t block_size_; +}; + +class HBMMemoryPool : public managed { + public: + HBMMemoryPool(size_t capacity, size_t block_size) + : capacity_(capacity), block_size_(block_size) {} + HBMMemoryPool(MemoryPool* mem_pool) { + capacity_ = mem_pool->capacity(); + block_size_ = mem_pool->block_size(); + VLOG(3) << "hbm memory pool with capacity" << capacity_ + << " bs: " << block_size_; + cudaMalloc(&mem_, block_size_ * capacity_); + cudaMemcpy(mem_, mem_pool->mem(), mem_pool->byte_size(), + cudaMemcpyHostToDevice); + } + + ~HBMMemoryPool() { + VLOG(3) << "delete hbm memory pool"; + cudaFree(mem_); + } + + size_t block_size() { return block_size_; } + + void clear(void) { cudaMemset(mem_, 0, block_size_ * capacity_); } + + void reset(size_t capacity) { + cudaFree(mem_); + mem_ = NULL; + capacity_ = capacity; + cudaMalloc(&mem_, (block_size_ * capacity / 8 + 1) * 8); + cudaMemset(mem_, 0, block_size_ * capacity); + } + + friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) { + for (size_t k = 0; k < 5; k++) { + auto x = (FeatureValue*)(p.mem() + k * p.capacity()); + out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot + << " lr: " << x->lr << " mf_dim: " << x->mf_size + << " mf_size: " << x->mf_size << " mf:"; + for (int i = 0; i < x->mf_size + 1; ++i) { + out << " " << x->mf[i]; + } + out << "\n"; + } + return out; + } + + char* mem() { return mem_; } + + size_t capacity() { return capacity_; } + __forceinline__ __device__ void* mem_address(const uint32_t& idx) { + return (void*)&mem_[(idx)*block_size_]; + } + + private: + char* mem_ = NULL; + size_t capacity_; + size_t block_size_; +}; + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 374984ecdb6b6..ff9976db5d875 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -96,6 +96,40 @@ class Optimizer { update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); } } + + __device__ void dy_mf_update_value(ValType* ptr, const GradType& grad) { + ptr->slot = grad.slot; + ptr->show += grad.show; + ptr->clk += grad.clk; + ptr->delta_score += + optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; + + update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); + // use MF_DIM temporarily + // ptr->mf_dim = grad.mf_dim; + + if (ptr->mf_size == 0) { + if (optimizer_config::mf_create_thresholds <= + optimizer_config::nonclk_coeff * (ptr->show - 
ptr->clk) + + optimizer_config::clk_coeff * ptr->clk) { + // ptr->mf_size = ptr->mf_dim + 1; + + ptr->mf_size = MF_DIM + 1; + ptr->mf[0] = 0; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + curandState state; + curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < MF_DIM; ++i) { + ptr->mf[i + 1] = + (curand_uniform(&state)) * optimizer_config::mf_initial_range; + } + } + } else { + update_mf(MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + grad.show); // for local test + } + } }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index a67f9a5e2c733..66f0d116f2412 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -112,20 +112,19 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, char* data_ptr = const_cast(req_data->data()); if (platform::is_cpu_place(tensor->place())) { - memcpy(data_ptr, tensor->data(), + memcpy(data_ptr, tensor->data(), tensor->numel() * SizeOfType(tensor->type())); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(platform::CPUPlace(), data_ptr, BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - tensor->data(), - tensor->numel() * SizeOfType(tensor->type()), nullptr); + tensor->data(), tensor->numel() * SizeOfType(tensor->type()), + nullptr); #endif #ifdef PADDLE_WITH_XPU memory::Copy(platform::CPUPlace(), data_ptr, BOOST_GET_CONST(platform::XPUPlace, tensor->place()), - tensor->data(), - tensor->numel() * SizeOfType(tensor->type())); + tensor->data(), tensor->numel() * SizeOfType(tensor->type())); #endif } } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 264155d79c0d0..2b712d8cc5db8 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -45,16 +45,30 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - gpu_task->init(thread_keys_shard_num_, device_num); + if (!multi_mf_dim_) { + gpu_task->init(thread_keys_shard_num_, device_num); + } else { + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); + } auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; std::vector threads; // data should be in input channel - thread_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_keys_[i].resize(thread_keys_shard_num_); + if (!multi_mf_dim_) { + thread_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_keys_[i].resize(thread_keys_shard_num_); + } + } else { + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); + } + } } size_t total_len = 0; @@ -87,10 +101,47 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { } } }; + auto gen_dynamic_mf_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + const auto& slot_offset = 
ins->slot_uint64_feasigns_.slot_offsets; + for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); + slot_idx++) { + for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; + j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { + int shard_id = feasign_v[j] % thread_keys_shard_num_; + int dim_id = slot_index_vec_[slot_idx]; + this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); + } + } + } + /* + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + for (const auto feasign : feasign_v) { + int shard_id = feasign % thread_keys_shard_num_; + this->thread_dim_keys_[i][shard_id][0].insert(feasign); + } + } + */ + }; for (int i = 0; i < thread_keys_thread_num_; i++) { - threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); + if (!multi_mf_dim_) { + VLOG(0) << "yxf::psgpu wrapper genfunc"; + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + } else { + VLOG(0) << "yxf::psgpu wrapper genfunc with dynamic mf"; + threads.push_back( + std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + } begin += len_per_thread + (i < remain ? 1 : 0); } for (std::thread& t : threads) { @@ -144,7 +195,13 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { thread_keys_[i][shard_num].clear(); } }; - + auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys(shard_num, dim_id, + thread_dim_keys_[i][shard_num][dim_id]); + thread_dim_keys_[i][shard_num][dim_id].clear(); + } + }; // for (size_t i = 0; i < thread_keys_.size(); i++) { // gpu_task->batch_add_keys(thread_keys_[i]); // for (int j = 0; j < thread_keys_thread_num_; j++) { @@ -152,7 +209,13 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { // } //} for (int i = 0; i < thread_keys_shard_num_; ++i) { - threads.push_back(std::thread(merge_ins_func, i)); + if (!multi_mf_dim_) { + threads.push_back(std::thread(merge_ins_func, i)); + } else { + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); + } + } } for (auto& t : threads) { t.join(); @@ -167,9 +230,20 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(1) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; - for (int i = 0; i < thread_keys_shard_num_; i++) { - VLOG(3) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); - local_ptr[i].resize(local_keys[i].size()); + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + VLOG(0) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); + local_ptr[i].resize(local_keys[i].size()); + } + } else { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] + << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); + } + } } } @@ -179,8 +253,20 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; + auto& local_dim_keys = gpu_task->feature_dim_keys_; + 
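// NOTE: a standalone sketch of the bucketing rule used by gen_dynamic_mf_func
// above: a feasign key picks its shard by modulo, and the slot's configured mf
// dim picks the second index. The key values and dim mapping are illustrative.
#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

int main() {
  const int shard_num = 37;
  const int dim_num = 2;
  std::vector<std::vector<std::unordered_set<uint64_t>>> dim_keys(
      shard_num, std::vector<std::unordered_set<uint64_t>>(dim_num));
  const std::vector<uint64_t> feasigns = {1001, 2002, 3003};
  const std::vector<int> dim_id_of_feasign = {0, 1, 0};  // from slot config
  for (size_t k = 0; k < feasigns.size(); ++k) {
    int shard_id = feasigns[k] % shard_num;
    dim_keys[shard_id][dim_id_of_feasign[k]].insert(feasigns[k]);
  }
  std::cout << (1001 % shard_num) << std::endl;  // shard of key 1001 -> 2
  return 0;
}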
auto& local_dim_ptr = gpu_task->value_dim_ptr_; + auto& device_keys = gpu_task->device_keys_; auto& device_vals = gpu_task->device_values_; + auto& device_dim_keys = gpu_task->device_dim_keys_; + auto& device_dim_ptr = gpu_task->device_dim_ptr_; + auto& device_dim_mutex = gpu_task->dim_mutex_; + if (multi_mf_dim_) { + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); + } + } auto& device_mutex = gpu_task->mutex_; std::vector threads(thread_keys_shard_num_); @@ -283,8 +369,63 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { << local_keys[i].size(); } }; - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(ptl_func, i); + + auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, + &fleet_ptr](int i, int j) { +#ifdef PADDLE_WITH_PSLIB + size_t key_size = local_dim_keys[i][j].size(); + int32_t status = -1; + int32_t cnt = 0; + while (true) { + auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( + reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, + local_dim_keys[i][j].data(), key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + + if (flag) { + break; + } + } + if (status != 0) { + LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + sleep(300); + exit(-1); + } else { + VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " + << local_dim_keys[i][j].size(); + } +#endif + }; + if (!multi_mf_dim_) { + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(ptl_func, i); + } + } else { + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = std::thread(ptl_dynamic_mf_func, i, j); + } + } } for (std::thread& t : threads) { t.join(); @@ -312,6 +453,37 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { table_id_, pass_id, pass_values); } #endif + auto build_dynamic_mf_func = [this, device_num, &local_dim_keys, + &local_dim_ptr, &device_dim_keys, + &device_dim_ptr, + &device_dim_mutex](int i, int j) { +#ifdef PADDLE_WITH_PSLIB + std::vector> task_keys(device_num); + std::vector> task_ptrs( + device_num); + for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { + int shard = local_dim_keys[i][j][k] % device_num; + task_keys[shard].push_back(local_dim_keys[i][j][k]); + task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); + } + for (int dev = 0; dev < device_num; dev++) { + for (int dim = 0; dim < multi_mf_dim_; dim++) { + device_dim_mutex[dev][dim]->lock(); + + int len = task_keys[dev].size(); + int cur = device_dim_keys[dev][dim].size(); + device_dim_keys[dev][dim].resize(device_dim_keys[dev][dim].size() + + len); + device_dim_ptr[dev][dim].resize(device_dim_ptr[dev][dim].size() + len); + for (int k = 0; k < len; ++k) { + device_dim_keys[dev][dim][cur + k] = task_keys[dev][k]; + device_dim_ptr[dev][dim][cur + k] = task_ptrs[dev][k]; + } + device_dim_mutex[dev][dim]->unlock(); + } + } +#endif + }; auto build_func = [device_num, record_status, 
&pass_values, &local_keys, &local_ptr, &device_keys, &device_vals, &device_mutex](int i) { @@ -415,8 +587,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } }; - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(build_func, i); + if (!multi_mf_dim_) { + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(build_func, i); + } + } else { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = + std::thread(build_dynamic_mf_func, i, j); + } + } } for (std::thread& t : threads) { t.join(); @@ -433,10 +614,21 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { std::vector feature_keys_count(device_num); size_t size_max = 0; - for (int i = 0; i < device_num; i++) { - feature_keys_count[i] = gpu_task->device_keys_[i].size(); - VLOG(1) << i << " card contains feasign nums: " << feature_keys_count[i]; - size_max = std::max(size_max, feature_keys_count[i]); + if (!multi_mf_dim_) { + for (int i = 0; i < device_num; i++) { + feature_keys_count[i] = gpu_task->device_keys_[i].size(); + VLOG(1) << i << " card contains feasign nums: " << feature_keys_count[i]; + size_max = std::max(size_max, feature_keys_count[i]); + } + } else { + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); + } + VLOG(1) << i << " card with dynamic mf contains feasign nums: " + << feature_keys_count[i]; + size_max = std::max(size_max, feature_keys_count[i]); + } } if (HeterPs_) { delete HeterPs_; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index c163c2de11019..c904e3557af27 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" @@ -48,6 +49,9 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) + class PSGPUWrapper { public: virtual ~PSGPUWrapper() { delete HeterPs_; } @@ -261,6 +265,44 @@ class PSGPUWrapper { slot_vector_ = slot_vector; } + void SetSlotOffsetVector(const std::vector& slot_offset_vector) { + slot_offset_vector_ = slot_offset_vector; + } + + void SetSlotDimVector(const std::vector& slot_mf_dim_vector) { + slot_mf_dim_vector_ = slot_mf_dim_vector; + assert(slot_mf_dim_vector_.size() == slot_vector_.size()); + for (size_t i = 0; i < slot_mf_dim_vector.size(); i++) { + slot_dim_map_[slot_vector_[i]] = slot_mf_dim_vector_[i]; + } + + std::unordered_set dims_set; + for (auto& it : slot_dim_map_) { + dims_set.insert(it.second); + } + size_t num_of_dim = dims_set.size(); + index_dim_vec_.resize(num_of_dim); + index_dim_vec_.assign(dims_set.begin(), dims_set.end()); + std::sort(index_dim_vec_.begin(), index_dim_vec_.end()); + std::unordered_map dim_index_map; + for (size_t i = 0; i < num_of_dim; i++) { + dim_index_map[index_dim_vec_[i]] = i; + } + hbm_pools_.resize(resource_->total_gpu() * num_of_dim); + mem_pools_.resize(resource_->total_gpu() * num_of_dim); + max_mf_dim_ = index_dim_vec_.back(); + multi_mf_dim_ = (dim_index_map.size() >= 1) ? dim_index_map.size() : 0; + resource_->set_multi_mf(multi_mf_dim_, max_mf_dim_); + slot_index_vec_.resize(slot_mf_dim_vector_.size()); + for (size_t i = 0; i < slot_index_vec_.size(); i++) { + slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; + } + val_type_size_ = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + grad_type_size_ = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + } + void ShowOneTable(int index) { HeterPs_->show_one_table(index); } private: @@ -274,6 +316,15 @@ class PSGPUWrapper { std::shared_ptr resource_; int32_t sleep_seconds_before_fail_exit_; std::vector slot_vector_; + std::vector slot_offset_vector_; + std::vector slot_mf_dim_vector_; + std::unordered_map slot_dim_map_; + std::vector slot_index_vec_; + std::vector index_dim_vec_; + int multi_mf_dim_{0}; + int max_mf_dim_{0}; + size_t val_type_size_{0}; + size_t grad_type_size_{0}; int multi_node_{0}; int node_size_; uint64_t table_id_; @@ -284,6 +335,8 @@ class PSGPUWrapper { std::unordered_set gpu_ps_config_keys_; HeterObjectPool gpu_task_pool_; std::vector>> thread_keys_; + std::vector>>> + thread_dim_keys_; int thread_keys_thread_num_ = 37; int thread_keys_shard_num_ = 37; uint64_t max_fea_num_per_pass_ = 5000000000; @@ -291,6 +344,10 @@ class PSGPUWrapper { int month_; int day_; + std::vector mem_pools_; + std::vector hbm_pools_; // in multi mfdim, one table need hbm + // pools of totol dims number + std::shared_ptr< paddle::framework::ChannelObject>> data_ready_channel_ = diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 93b7869cc1d25..3ed886e874db0 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -122,7 +122,8 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { #endif #ifdef PADDLE_WITH_XPU - xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); + auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + platform::XPUDeviceGuard guard(dev_id); #endif auto& block = program.Block(0); @@ -338,22 +339,23 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, auto dev_id = 
BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device; platform::CUDADeviceGuard guard(dev_id); - cudaMemset(thread_tensor->data(), 0, + cudaMemset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); #endif #ifdef PADDLE_WITH_XPU auto place = thread_tensor->place(); - xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); + auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + platform::XPUDeviceGuard guard(dev_id); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContext* dev_ctx = pool.Get(place); const platform::XPUDeviceContext* xpu_ctx = reinterpret_cast(dev_ctx); - xpu::memset(xpu_ctx->x_context(), thread_tensor->data(), 0, + xpu::memset(xpu_ctx->x_context(), thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); #endif } else { - memset(thread_tensor->data(), 0, + memset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); } } @@ -365,22 +367,23 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device; platform::CUDADeviceGuard guard(dev_id); - cudaMemset(root_tensor->data(), 0, + cudaMemset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); #endif #ifdef PADDLE_WITH_XPU auto place = root_tensor->place(); - xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); + auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + platform::XPUDeviceGuard guard(dev_id); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContext* dev_ctx = pool.Get(place); const platform::XPUDeviceContext* xpu_ctx = reinterpret_cast(dev_ctx); - xpu::memset(xpu_ctx->x_context(), root_tensor->data(), 0, + xpu::memset(xpu_ctx->x_context(), root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); #endif } else { - memset(root_tensor->data(), 0, + memset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); } } @@ -416,7 +419,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, std::shared_ptr context = object_pool_.Get(); if (!context->scope_) { - int num = rand() % places_.size(); + int num = rand_r() % places_.size(); context->place_num_ = num; auto place = places_[num]; context->scope_ = &(place_scopes_[num]->NewScope()); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index af6773042b678..6cd16132c2a10 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -130,7 +130,7 @@ ConvAffineChannelFusePass::ConvAffineChannelFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW" /*, "NHWC", "AnyLayout"*/}) + .IsStringIn({"NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("affine_channel")) @@ -148,7 +148,7 @@ ConvAffineChannelFusePass::ConvAffineChannelFusePass() { .IsTensor() .End() .AddAttr("data_layout") - .IsStringIn({"NCHW" /*, "NHWC", "AnyLayout"*/}) + .IsStringIn({"NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -197,6 +197,13 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { GET_CONV_BN_NODES(conv_ac_pattern); + auto data_format = conv->Op()->GetAttrIfExists("data_format"); + if (data_format == "AnyLayout") { + LOG_FIRST_N(WARNING, 1) << 
"conv_affine_channel_fuse_pass is enabled, " + "it's wrong if data_format of conv is not " + "NCHW."; + } + // Get affine_channel bias for resizing eltwise_y! auto* ac_bias_tensor = scope->FindVar(ac_bias->Name())->GetMutable(); @@ -282,7 +289,7 @@ ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW" /*, "NHWC", "AnyLayout"*/}) + .IsStringIn({"NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("affine_channel")) .AddInput("X") @@ -299,7 +306,7 @@ ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { .IsTensor() .End() .AddAttr("data_layout") - .IsStringIn({"NCHW" /*, "NHWC", "AnyLayout"*/}) + .IsStringIn({"NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) .AddInput("X") @@ -347,6 +354,12 @@ void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { VLOG(4) << "handle ConvBN fuse"; GET_CONV_BN_NODES(conv_ac_pattern); + auto data_format = conv->Op()->GetAttrIfExists("data_format"); + if (data_format == "AnyLayout") { + LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is " + "enabled, it's wrong if data_format of conv " + "is not NCHW."; + } // OPERATORS GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); // BIAS inputs diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 3d1c1eb55aa07..27e52167f3137 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -77,7 +77,7 @@ ConvElementwiseAdd2ActFusePass::ConvElementwiseAdd2ActFusePass() { .AddAttr("dilations") .End() .AddAttr("data_format") - .IsStringIn({"NHWC", "NCHW"}) + .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc index 439b85ffb9f10..545e4a7b9e616 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -57,7 +57,7 @@ ConvElementwiseAddFusePass::ConvElementwiseAddFusePass() { .AddAttr("dilations") .End() .AddAttr("data_format") - .IsStringIn({"NCHW" /*, "NHWC", "AnyLayout"*/}) + .IsStringIn({"NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -97,6 +97,13 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { GET_NODES; auto base_op_desc = *conv_op->Op()->Proto(); + auto data_format = + conv_op->Op()->GetAttrIfExists("data_format"); + if (data_format == "AnyLayout") { + LOG_FIRST_N(WARNING, 1) << "conv_elementwise_add_fuse_pass is enabled, " + "it's wrong if data_format of conv is not " + "NCHW."; + } std::string bias_name = elementwise_add_in_y->Name(); std::string output_name = elementwise_add_out->Name(); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index 52e88c6408b0e..53cd2335fe23f 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -31,6 +31,22 @@ namespace ir { class Node; MulGRUFusePass::MulGRUFusePass() { + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(1) + .End() + .AddAttr("y_num_col_dims") + 
.IsNumEQ(1) + .End(); AddOpCompat(OpCompat("gru")) .AddInput("Input") .IsTensor() @@ -58,10 +74,10 @@ MulGRUFusePass::MulGRUFusePass() { .IsTensor() .End() .AddAttr("activation") - .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .IsStringIn({"sigmoid", "tanh"}) .End() .AddAttr("gate_activation") - .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .IsStringIn({"sigmoid", "tanh"}) .End() .AddAttr("is_reverse") .IsType() @@ -70,22 +86,6 @@ MulGRUFusePass::MulGRUFusePass() { .IsType() .IsOptional() .End(); - AddOpCompat(OpCompat("mul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumEQ(1) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); } FCGRUFusePass::FCGRUFusePass() { diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index d72b626fc1ebc..b99e607f92b5d 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -68,13 +68,13 @@ MulLstmFusePass::MulLstmFusePass() { .IsType() .End() .AddAttr("gate_activation") - .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .IsStringIn({"sigmoid"}) .End() .AddAttr("cell_activation") - .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .IsStringIn({"tanh", "relu", "identity"}) .End() .AddAttr("candidate_activation") - .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .IsStringIn({"tanh", "relu", "identity"}) .End(); AddOpCompat(OpCompat("mul")) .AddInput("X") diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc old mode 100755 new mode 100644 index 314f791da4f46..8c4965fc40235 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -924,6 +924,7 @@ PDNode *patterns::SeqConvEltAddRelu::operator()( seqconv_input->assert_is_op_input("sequence_conv", "X"); auto *seqconv_op = pattern->NewNode(seqconv_repr()) ->assert_is_op("sequence_conv") + ->assert_has_n_inputs(2) ->assert_op_attr("paddingTrainable", false) ->assert_op_attr("contextStride", 1); @@ -1640,6 +1641,32 @@ PDNode *patterns::Slice::operator()() { return slice_out; } +PDNode *patterns::NearestInterp::operator()() { + auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); + + auto nearest_interp_op = + pattern->NewNode(nearest_interp_op_repr()) + ->assert_is_ops({"nearest_interp", "nearest_interp_v2"}); + + auto nearest_interp_in = + pattern->NewNode(nearest_interp_in_repr()) + ->AsInput() + ->assert_is_ops_input({"nearest_interp", "nearest_interp_v2"}, "X"); + auto nearest_interp_out = + pattern->NewNode(nearest_interp_out_repr()) + ->AsOutput() + ->assert_is_ops_output({"nearest_interp", "nearest_interp_v2"}, + "Out"); + + auto next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + prev_op->LinksTo({nearest_interp_in}); + nearest_interp_op->LinksFrom({nearest_interp_in}) + .LinksTo({nearest_interp_out}); + next_op->LinksFrom({nearest_interp_out}); + return nearest_interp_out; +} + PDNode *patterns::Matmul::operator()() { auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul"); @@ -2375,15 +2402,8 @@ PDNode *patterns::MultipleQuantize::operator()() { PDNode *patterns::QuantizePlacement::operator()( const std::unordered_set &quantize_enabled_op_types) { - std::unordered_set supported_op_types = - std::unordered_set({"concat", "conv2d", "elementwise_add", - "fc", "matmul", 
"pool2d", "prior_box", - "reshape2", "transpose2", "fusion_gru", - "fusion_lstm", "multi_gru", "slice"}); - if (!quantize_enabled_op_types.empty()) { - supported_op_types = quantize_enabled_op_types; - } - auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types); + auto *op = + pattern->NewNode(op_repr())->assert_is_ops(quantize_enabled_op_types); return op; } @@ -2421,11 +2441,13 @@ PDNode *patterns::Bfloat16Placement::operator()( if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } + auto *op_in = pattern->NewNode(op_in_repr())->AsInput(); auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types); op->assert_more([&](Node *node) { return node->Op()->GetAttrIfExists("use_mkldnn") || node->Op()->Type() == "reshape2"; }); + op->LinksFrom({op_in}); return op; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index deaba36ba5da2..5b996a3ab918b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -995,6 +995,21 @@ struct Slice : public PatternBase { PATTERN_DECL_NODE(next_op); }; +// Nearest Interp op +// Forward pass for nearest_interp. +// nearest_interp_out is a result of the operator. +struct NearestInterp : public PatternBase { + NearestInterp(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "nearest_interp") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(prev_op); + PATTERN_DECL_NODE(nearest_interp_in); + PATTERN_DECL_NODE(nearest_interp_op); + PATTERN_DECL_NODE(nearest_interp_out); + PATTERN_DECL_NODE(next_op); +}; + // Matmul op // Forward pass for matmul. struct Matmul : public PatternBase { @@ -1431,6 +1446,7 @@ struct Bfloat16Placement : public PatternBase { PDNode* operator()( const std::unordered_set& bfloat16_enabled_op_types); + PATTERN_DECL_NODE(op_in); PATTERN_DECL_NODE(op); }; diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index 865b556f301c0..734f8957ad09e 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -169,9 +169,6 @@ Flatten2MatmulFusePass::Flatten2MatmulFusePass() { .AddInput("X") .IsTensor() .End() - .AddInput("Y") - .IsTensor() - .End() .AddOutput("Out") .IsTensor() .End() @@ -179,7 +176,7 @@ Flatten2MatmulFusePass::Flatten2MatmulFusePass() { .IsTensor() .End() .AddAttr("axis") - .IsNumGE(0) + .IsNumEQ(1) .End(); AddOpCompat(OpCompat("mul")) @@ -222,7 +219,7 @@ Squeeze2MatmulFusePass::Squeeze2MatmulFusePass() { .IsBoolEQ(false) .End(); - AddOpCompat(OpCompat("Squeeze2")) + AddOpCompat(OpCompat("squeeze2")) .AddInput("X") .IsTensor() .End() @@ -593,10 +590,10 @@ Reshape2MatmulFusePass::Reshape2MatmulFusePass() { .IsNumLT(1.00001f) .End() .AddAttr("transpose_X") - .IsBoolEQ("False") + .IsBoolEQ(false) .End() .AddAttr("transpose_Y") - .IsBoolEQ("False") + .IsBoolEQ(false) .End(); AddOpCompat(OpCompat("mul")) diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc index c5bb4bf0b2fc9..9f6cd8992dcb9 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc @@ -108,15 +108,6 @@ void FuseBatchNormActOneDNNPass::FuseBatchNormAct( GET_IR_NODE_FROM_SUBGRAPH(act, act, bn_act_pattern); auto *bn_op = batch_norm->Op(); - - if 
(bn_op->HasAttr("use_mkldnn")) { - PADDLE_ENFORCE( - BOOST_GET_CONST(bool, bn_op->GetAttr("use_mkldnn")), - platform::errors::PreconditionNotMet( - "The BatchNorm+Act fusion may happen only when oneDNN library " - "is used.")); - } - if (bn_op->HasAttr("trainable_statistics")) { PADDLE_ENFORCE( !BOOST_GET_CONST(bool, bn_op->GetAttr("trainable_statistics")), diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 8255a40a2c0ca..0d0151fb738db 100755 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -157,7 +157,7 @@ ConvActivationFusePass::ConvActivationFusePass() { // IsStringIn({"NHWC", "NCHW"}) MobileNetV2 has no this attribute .AddAttr("data_format") .IsOptional() - .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) + .IsStringIn({"NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("relu")) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index 8031f56752ac8..c537d05738529 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -117,7 +117,7 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .IsStringIn({"NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -131,7 +131,7 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .IsTensor() .End() .AddAttr("axis") - .IsIntIn({-1, 0}) + .IsIntIn({-1, 0, 1}) .End(); } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc index 0f9edeba525b0..d89891ec3c857 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -41,8 +41,12 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_placement_pattern); GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_placement_pattern); + // Only float input can be converted to bfloat16 + if (op_in->Var()->GetDataType() != proto::VarType::FP32) return; + if ((op->Op()->HasAttr("mkldnn_data_type") || op->Op()->HasProtoAttr("mkldnn_data_type")) && !platform::HasOpINT8DataType(op->Op())) { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc index 28a45f36fb71d..e3ef7b7af05d2 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc @@ -68,7 +68,7 @@ ProgramDesc BuildProgramDesc() { for (auto& v : std::vector({"a", "b", "c", "f", "g", "h", "k", "l", "m", "n", "o", "p", "r", "s"})) { - prog.MutableBlock(0)->Var(v); + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); } SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}); @@ -86,9 +86,8 @@ ProgramDesc BuildProgramDesc() { } void MainTest(std::initializer_list bfloat16_enabled_op_types, - unsigned expected_bfloat16_data_type_count) { - auto prog = BuildProgramDesc(); - + unsigned 
expected_bfloat16_data_type_count, + const ProgramDesc& prog) { std::unique_ptr graph(new ir::Graph(prog)); auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass"); @@ -110,8 +109,8 @@ void MainTest(std::initializer_list bfloat16_enabled_op_types, EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count); } -void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) { - auto prog = BuildProgramDesc(); +void DefaultAttrTest(unsigned expected_bfloat16_data_type_count, + const ProgramDesc& prog) { std::unique_ptr graph(new ir::Graph(prog)); auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass"); graph.reset(pass->Apply(graph.release())); @@ -128,15 +127,39 @@ void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) { } TEST(Bfloat16PlacementPass, enable_all) { - MainTest({"conv2d", "pool2d", "gelu", "concat", "sum"}, 8); + MainTest({"conv2d", "pool2d", "gelu", "concat", "sum"}, 8, + BuildProgramDesc()); } TEST(Bfloat16PlacementPass, enabled_conv_and_pool) { // 2 conv2d + 2 pool2 - 1 orphaned conv2d - MainTest({"conv2d", "pool2d"}, 3); + MainTest({"conv2d", "pool2d"}, 3, BuildProgramDesc()); +} + +TEST(Bfloat16PlacementPass, default_attr_value) { + DefaultAttrTest(10, BuildProgramDesc()); +} + +ProgramDesc BuildProgramDescWithDataType() { + ProgramDesc prog; + + for (auto& v : std::vector({"a", "b", "c", "d", "e"})) { + if (v == "a") { + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::INT32); + } else { + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); + } + } + + SetOp(&prog, "conv2d", "conv1", {"a"}, {"b"}); + SetOp(&prog, "pool2d", "pool1", {"b"}, {"c"}); + SetOp(&prog, "concat", "concat1", {"c", "d"}, {"e"}); + return prog; } -TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(10); } +TEST(Bfloat16PlacementPass, check_data_types) { + DefaultAttrTest(2, BuildProgramDescWithDataType()); +} } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 3df4a84470524..64d9bf603533e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -1053,6 +1053,67 @@ void CPUQuantizePass::QuantizeFusionLSTM(Graph* graph) const { PrettyLogDetail("--- quantized %d fusion_lstm ops", quantize_count); } +void CPUQuantizePass::QuantizeNearestInterp(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::NearestInterp nearest_interp_pattern{pattern, name_scope_}; + nearest_interp_pattern(); + + int quantize_nearest_interp_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize nearest_interp op"; + GET_IR_NODE_FROM_SUBGRAPH(nearest_interp_op, nearest_interp_op, + nearest_interp_pattern); + + // skip if should not be quantized + if (!platform::HasOpINT8DataType(nearest_interp_op->Op())) { + LogQuantizationDisabled(nearest_interp_op); + return; + } + GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, nearest_interp_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, nearest_interp_pattern); + + // skip if prev op and next op is not quantized + if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(next_op))) { + LogCannotQuantizeOp(nearest_interp_op, + "There are no other quantized operators nearby, so " + "quantization is not recommended."); + return; + } + + GET_IR_NODE_FROM_SUBGRAPH(nearest_interp_in, nearest_interp_in, + 
nearest_interp_pattern); + GET_IR_NODE_FROM_SUBGRAPH(nearest_interp_out, nearest_interp_out, + nearest_interp_pattern); + + if (!AreScalesPresentForNodes({nearest_interp_in, nearest_interp_out})) { + LogCannotQuantizeOp(nearest_interp_op); + return; + } + + bool is_input_unsigned{false}; + auto input_scale = + GetScaleValueForNode(nearest_interp_in, &is_input_unsigned); + QuantizeInput(g, nearest_interp_op, nearest_interp_in, "X", input_scale, + is_input_unsigned); + + bool is_output_unsigned{false}; + auto output_scale = + GetScaleValueForNode(nearest_interp_out, &is_output_unsigned); + DequantizeOutput(g, nearest_interp_op, nearest_interp_out, "Out", + output_scale, is_output_unsigned); + + ++quantize_nearest_interp_count; + }; + + gpd(graph, handler); + AddStatis(quantize_nearest_interp_count); + + PrettyLogDetail("--- quantized %d nearest_interp ops", + quantize_nearest_interp_count); +} + void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; PADDLE_ENFORCE_NOT_NULL( @@ -1076,6 +1137,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); QuantizeSlice(graph); + QuantizeNearestInterp(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index b3ee98263c0c0..412c4e40a01d5 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -62,6 +62,7 @@ class CPUQuantizePass : public FusePassBase { void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; void QuantizeSlice(Graph* graph) const; + void QuantizeNearestInterp(Graph* graph) const; void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, double scale_to_one, bool is_input_unsigned, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 838912f659ff7..e7c236bc489b7 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" // NOLINT #include +#include #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/imperative/type_defs.h" @@ -23,6 +24,10 @@ namespace paddle { namespace framework { namespace ir { +static float const SCALE = 2.f; +static int const S8_MAX = 127; +static int const U8_MAX = 255; + void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, const std::vector& inputs, const std::vector& outputs, bool use_mkldnn, @@ -31,6 +36,9 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetType(type); op->SetAttr("use_mkldnn", use_mkldnn); op->SetAttr("name", name); + if (type != "dropout" || type != "quantize" || type != "dequantize") { + op->SetAttr("mkldnn_data_type", mkldnn_data_type); + } if (type == "conv2d") { op->SetInput("Input", {inputs[0]}); @@ -47,18 +55,16 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("fuse_residual_connection", false); } op->SetOutput("Output", {outputs[0]}); - op->SetAttr("mkldnn_data_type", mkldnn_data_type); op->SetAttr("Scale_in", 1.0f); op->SetAttr("Scale_out", 1.0f); op->SetAttr("Scale_weights", std::vector{1.0f}); - } else if (type == "pool2d" || type == "transpose2" || type == "reshape2") { + } else if (type == "pool2d" || type == "transpose2" || type == "reshape2" || + type == "nearest_interp" || type == "nearest_interp_v2") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); - op->SetAttr("mkldnn_data_type", mkldnn_data_type); } else if (type == "slice") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); - op->SetAttr("mkldnn_data_type", mkldnn_data_type); } else if (type == "dropout") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); @@ -67,14 +73,12 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, if (inputs.size() > 1) op->SetInput("W", {inputs[1]}); if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); op->SetOutput("Out", {outputs[0]}); - op->SetAttr("mkldnn_data_type", mkldnn_data_type); op->SetAttr("Scale_in", 1.0f); op->SetAttr("Scale_out", 1.0f); op->SetAttr("Scale_weights", std::vector{1.0f}); } else if (type == "concat") { op->SetInput("X", inputs); op->SetOutput("Out", outputs); - op->SetAttr("mkldnn_data_type", mkldnn_data_type); } else if (type == "dequantize") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Output", {outputs[0]}); @@ -83,7 +87,6 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); - op->SetAttr("mkldnn_data_type", mkldnn_data_type); op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); @@ -91,7 +94,6 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); - op->SetAttr("mkldnn_data_type", mkldnn_data_type); op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); @@ -101,7 +103,6 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("WeightX", {inputs[2]}); op->SetInput("WeightH", {inputs[3]}); op->SetOutput("Hidden", {outputs[0]}); - op->SetAttr("mkldnn_data_type", 
mkldnn_data_type); op->SetAttr("Scale_data", 1.0f); op->SetAttr("Shift_data", 0.0f); op->SetAttr("Weight_scale", std::vector{1.0f}); @@ -114,7 +115,6 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetOutput("Hidden", {outputs[0]}); op->SetOutput("Cell", {outputs[1]}); - op->SetAttr("mkldnn_data_type", mkldnn_data_type); op->SetAttr("Scale_data", 1.0f); op->SetAttr("Shift_data", 0.0f); op->SetAttr("Weight_scale", std::vector{1.0f}); @@ -144,7 +144,7 @@ void PreparePass(std::unique_ptr* graph, const ProgramDesc& prog, LoDTensor tensor; tensor.Resize({1}); auto* ptr = tensor.mutable_data(place); - ptr[0] = 2.0; + ptr[0] = SCALE; (*scales)[v] = std::make_pair(v == var_signed, std::move(tensor)); } @@ -158,7 +158,57 @@ void PreparePass(std::unique_ptr* graph, const ProgramDesc& prog, *current_nodes_num = (*graph)->Nodes().size(); } -namespace { +void CheckScales(const OpDesc* op, float scale, float shift) { + std::string type = op->Type(); + std::vector scale_names; + if (type == "conv2d" || type == "fc") { + EXPECT_EQ(op->GetAttrIfExists>("Scale_weights")[0], + scale); + scale_names.push_back("Scale_in"); + scale_names.push_back("Scale_out"); + } else if (type == "matmul" || type == "elementwise_add") { + scale_names.push_back("Scale_x"); + scale_names.push_back("Scale_y"); + scale_names.push_back("Scale_out"); + } else if (type == "fusion_gru" || type == "fusion_lstm") { + EXPECT_EQ(op->GetAttrIfExists("Shift_data"), shift); + EXPECT_EQ(op->GetAttrIfExists>("Scale_weights")[0], + scale); + EXPECT_EQ(op->GetAttrIfExists("force_fp32_output"), true); + scale_names.push_back("Scale_data"); + } + + for (auto const& scale_name : scale_names) { + EXPECT_EQ(op->GetAttrIfExists(scale_name), scale); + } +} + +void MainTest(const ProgramDesc& prog, + const std::vector variable_names, + std::unordered_map expected_operators, + const int added_nodes_count, float scale = 1.f, float shift = 1.f, + std::string var_without_scale = "", std::string var_signed = "") { + std::unique_ptr graph(new ir::Graph(prog)); + int original_nodes_num, current_nodes_num; + PreparePass(&graph, prog, variable_names, &original_nodes_num, + ¤t_nodes_num, var_without_scale, var_signed); + std::unordered_map actual_operators; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (expected_operators.count(op->Type()) > 0) { + expected_operators[op->Type()]--; + if (op->GetAttrIfExists("mkldnn_data_type") == "int8") + CheckScales(op, scale, shift); + } + } + } + for (auto const& pair : expected_operators) { + EXPECT_EQ(pair.second, 0); + } + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + static const std::initializer_list variable_names{ "a", "w1", "c", "d", "w2", "e", "f", "g", "h", "w3", "b1", "i", "j", "w4", "b2", "w5", "b3"}; @@ -199,48 +249,6 @@ ProgramDesc BuildProgramDesc(bool use_mkldnn, return prog; } -void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, - int quant_count, int dequant_count, int added_nodes_count, - float scale) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names, &original_nodes_num, - ¤t_nodes_num); - - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int conv2d_nodes_count = 0; - int pool2d_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "conv2d") { - conv2d_nodes_count++; - auto op_name = 
BOOST_GET_CONST(std::string, op->GetAttr("name")); - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_in")), scale) - << "Scale_in for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_out")), scale) - << "Scale_out for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(std::vector, - op->GetAttr("Scale_weights"))[0], - scale) - << "Scale_weights for node '" + op_name + "'."; - } else if (op->Type() == "pool2d") { - pool2d_nodes_count++; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - } - } - } - EXPECT_EQ(conv2d_nodes_count, conv_count); - EXPECT_EQ(pool2d_nodes_count, pool_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); -} - TEST(CpuQuantizePass, quantize) { bool use_mkldnn = true; std::string mkldnn_data_type = "int8"; @@ -256,16 +264,20 @@ TEST(CpuQuantizePass, quantize) { // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i // Insert nodes: 8 Quant + 8 IN + 7 OUT + 7 DEQUANT int added_nodes = 8 + 8 + 7 + 7; - MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), 4, 2, 8, 7, - added_nodes, 2.0f * 127); + std::unordered_map expected_operators = { + {"conv2d", 4}, {"pool2d", 2}, {"quantize", 8}, {"dequantize", 7}}; + MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), variable_names, + expected_operators, added_nodes, SCALE * S8_MAX); } TEST(CpuQuantizePass, do_not_quantize) { bool use_mkldnn = true; std::string mkldnn_data_type = "float32"; int added_nodes = 0; - MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), 4, 2, 0, 0, - added_nodes, 1.0f); + std::unordered_map expected_operators = { + {"conv2d", 4}, {"pool2d", 2}, {"quantize", 0}, {"dequantize", 0}}; + MainTest(BuildProgramDesc(use_mkldnn, mkldnn_data_type), variable_names, + expected_operators, added_nodes, 1.0f); } static const std::initializer_list variable_names_concat = { @@ -286,134 +298,16 @@ ProgramDesc BuildProgramDescConcat() { return prog; } -void MainTestConcat(const ProgramDesc& prog, int pool_count, int concat_count, - int quant_count, int dequant_count, int added_nodes_count) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names_concat, &original_nodes_num, - ¤t_nodes_num); - - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int concat_nodes_count = 0; - int pool2d_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "concat") { - concat_nodes_count++; - } else if (op->Type() == "pool2d") { - pool2d_nodes_count++; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - } - } - } - EXPECT_EQ(concat_nodes_count, concat_count); - EXPECT_EQ(pool2d_nodes_count, pool_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); -} - TEST(CpuQuantizePass, concat) { // a1->Pool1->b1 // a2->Pool2->b2 // (b1->QUANT1->IN1, b2->QUANT2->IN2)->Concat->c // c->OUT1->DEQUANT1->Pool3->d - int pool_count = 3; - int concat_count = 1; - int quant_count = 2; - int dequant_count = 1; - int added_nodes_count = 6; - MainTestConcat(BuildProgramDescConcat(), pool_count, concat_count, - quant_count, 
dequant_count, added_nodes_count); -} - -static const std::initializer_list variable_names_transpose = { - "a", "w1", "b", "c", "w2", "d", "e", "f"}; - -// a->Conv1->b -// b->Transpose1->c -// c->Conv2->d -// d->Transpose2->e -// e->Dropout->f -ProgramDesc BuildProgramDescTranspose() { - ProgramDesc prog; - for (auto& v : variable_names_transpose) { - auto* var = prog.MutableBlock(0)->Var(v); - if (v.find("w") == 0) { - var->SetPersistable(true); - } - } - - SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"b"}, true, "int8"); - SetOp(&prog, "transpose2", "Transpose1", {"b"}, {"c"}, true, "int8"); - SetOp(&prog, "conv2d", "Conv1", {"c", "w2"}, {"d"}, true, "int8"); - SetOp(&prog, "transpose2", "Transpose2", {"d"}, {"e"}, true, "int8"); - SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); - - return prog; -} - -void MainTestTranspose(const ProgramDesc& prog, int conv_count, - int transpose_count, int quant_count, int dequant_count, - int added_nodes_count, float scale) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names_transpose, &original_nodes_num, - ¤t_nodes_num); - - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int transpose_nodes_count = 0; - int conv_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "transpose2") { - transpose_nodes_count++; - } else if (op->Type() == "conv2d") { - conv_nodes_count++; - auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name")); - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_in")), scale) - << "Scale_in for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_out")), scale) - << "Scale_out for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(std::vector, - op->GetAttr("Scale_weights"))[0], - scale) - << "Scale_weights for node '" + op_name + "'."; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - } - } - } - EXPECT_EQ(transpose_nodes_count, transpose_count); - EXPECT_EQ(conv_nodes_count, conv_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); -} - -TEST(CpuQuantizePass, transpose) { - // a1->Quant->a2->Conv1->b1->Dequant->b2 - // b2->Quant->b3->Transpose->c1->Dequant->c2 - // c2->Quant->c3->Conv2->d1->Dequant->d2 - // d2->Quant->d3->Transpose->e1->Dequant->e2 - // e2->Dropout->f - int conv_count = 2; - int transpose_count = 2; - int quant_count = 4; - int dequant_count = 4; - // 4 Quant + 4 IN + 4 DeQuant + 4 OUT - int added_nodes_count = 4 + 4 + 4 + 4; - MainTestTranspose(BuildProgramDescTranspose(), conv_count, transpose_count, - quant_count, dequant_count, added_nodes_count, 2.0f * 127); + int added_nodes = 6; + std::unordered_map expected_operators = { + {"pool2d", 3}, {"concat", 1}, {"quantize", 2}, {"dequantize", 1}}; + MainTest(BuildProgramDescConcat(), variable_names_concat, expected_operators, + added_nodes); } static const std::initializer_list variable_names_fusion_gru = { @@ -422,7 +316,7 @@ static const std::initializer_list variable_names_fusion_gru = { // (x, wx, wh, b)->Fusion_gru->h ProgramDesc BuildProgramDescFusionGru() { ProgramDesc prog; - for (auto& v : variable_names_transpose) { + for (auto& v : variable_names_fusion_gru) { auto* var = prog.MutableBlock(0)->Var(v); if (v.find("wx") 
== 0 || v.find("wh") || v.find("b")) { var->SetPersistable(true); @@ -441,7 +335,7 @@ static const std::initializer_list variable_names_fusion_lstm = { // (x, wx, wh, b)->Fusion_lstm_1->h ProgramDesc BuildProgramDescFusionLSTM() { ProgramDesc prog; - for (auto& v : variable_names_transpose) { + for (auto& v : variable_names_fusion_lstm) { auto* var = prog.MutableBlock(0)->Var(v); if (v.find("wx") == 0 || v.find("wh") || v.find("b")) { var->SetPersistable(true); @@ -454,109 +348,192 @@ ProgramDesc BuildProgramDescFusionLSTM() { return prog; } -void MainTestFusionGru(const ProgramDesc& prog, int gru_count, int quant_count, - int dequant_count, int added_nodes_count, float scale, - float shift) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names_fusion_gru, &original_nodes_num, - ¤t_nodes_num); +TEST(CpuQuantizePass, fusion_gru) { + // (x, wx, wh, b)->Fusion_gru->h - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int gru_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "fusion_gru") { - gru_nodes_count++; + // 1 Quant + 1 IN + 0 DeQuant + 0 OUT + int added_nodes = 1 + 1 + 0 + 0; + std::unordered_map expected_operators = { + {"fusion_gru", 1}, {"quantize", 1}, {"dequantize", 0}}; + MainTest(BuildProgramDescFusionGru(), variable_names_fusion_gru, + expected_operators, added_nodes, SCALE * S8_MAX, 128); +} - auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name")); - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_data")), scale) - << "Scale_data for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Shift_data")), shift) - << "Shift_data for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(std::vector, - op->GetAttr("Scale_weights"))[0], - scale) - << "Scale_weights for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(bool, op->GetAttr("force_fp32_output")), true) - << "force_fp32_output for node '" + op_name + "'."; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - } - } +TEST(CpuQuantizePass, fusion_lstm) { + // (x, wx, wh, b)->Fusion_lstm->h + + // 1 Quant + 1 IN + 0 DeQuant + 0 OUT + int added_nodes = 1 + 1 + 0 + 0; + std::unordered_map expected_operators = { + {"fusion_lstm", 1}, {"quantize", 1}, {"dequantize", 0}}; + MainTest(BuildProgramDescFusionLSTM(), variable_names_fusion_lstm, + expected_operators, added_nodes, SCALE * S8_MAX, 128.); +} + +static const std::initializer_list variable_names_immutable_ops = { + "a", "w1", "b", "c", "d"}; + +// a->Dequantize->b +// b->Tested Op->c +// c->Dropout->d +void TestImmutableOp(const std::string tested_op) { + ProgramDesc prog; + for (auto& v : variable_names_immutable_ops) { + prog.MutableBlock(0)->Var(v); } - EXPECT_EQ(gru_nodes_count, gru_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); + SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); + SetOp(&prog, tested_op, tested_op, {"b"}, {"c"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); + + // a->Dequantize->b + // b2->Quant->b3->Tested Op->c1->Dequant->c2 + // c2->Dropout->d + // 1 Quant + 1 IN + 1 DeQuant + 1 OUT + int added_nodes = 4; + std::unordered_map expected_operators = { + {tested_op, 1}, {"quantize", 
1}, {"dequantize", 2}}; + MainTest(prog, variable_names_immutable_ops, expected_operators, added_nodes, + SCALE * S8_MAX); } -TEST(CpuQuantizePass, fusion_gru) { - // (x, wx, wh, b)->Fusion_gru->h - int gru_count = 1; - int quant_count = 1; - int dequant_count = 0; - // 1 Quant + 1 IN + 0 DeQuant + 0 OUT - int added_nodes_count = 1 + 1 + 0 + 0; - MainTestFusionGru(BuildProgramDescFusionGru(), gru_count, quant_count, - dequant_count, added_nodes_count, 2. * 127, 128.); +// a->Dropout1->b +// b->Tested Op->c +// c->Dropout2->d +void TestImmutableOpBetweenNonQuantizedOp(const std::string tested_op) { + ProgramDesc prog; + for (auto& v : variable_names_immutable_ops) { + prog.MutableBlock(0)->Var(v); + } + + SetOp(&prog, "dropout", "Dropout1", {"a"}, {"b"}, true, "float32"); + SetOp(&prog, tested_op, tested_op, {"b"}, {"c"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout2", {"c"}, {"d"}, true, "float32"); + + // 0 Quant + 0 IN + 0 DeQuant + 0 OUT + int added_nodes = 0; + std::unordered_map expected_operators = { + {tested_op, 1}, {"dropout", 2}, {"quantize", 0}, {"dequantize", 0}}; + MainTest(prog, variable_names_immutable_ops, expected_operators, added_nodes, + SCALE * S8_MAX); } -void MainTestFusionLSTM(const ProgramDesc& prog, int expect_lstm_count, - int quant_count, int dequant_count, - int added_nodes_count, float scale, float shift) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names_fusion_lstm, &original_nodes_num, - ¤t_nodes_num); +TEST(CpuQuantizePass, reshape2) { TestImmutableOp("reshape2"); } - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int lstm_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "fusion_lstm") { - lstm_nodes_count++; +TEST(CpuQuantizePass, reshape2BetweenNonQuantizedOp) { + TestImmutableOpBetweenNonQuantizedOp("reshape2"); +} - auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name")); - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_data")), scale) - << "Scale_data for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Shift_data")), shift) - << "Shift_data for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(std::vector, - op->GetAttr("Scale_weights"))[0], - scale) - << "Scale_weights for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(bool, op->GetAttr("force_fp32_output")), true) - << "force_fp32_output for node '" + op_name + "'."; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - } - } +TEST(CpuQuantizePass, transpose2) { TestImmutableOp("transpose2"); } + +TEST(CpuQuantizePass, transpose2BetweenNonQuantizedOp) { + TestImmutableOpBetweenNonQuantizedOp("transpose2"); +} + +TEST(CpuQuantizePass, slice) { TestImmutableOp("slice"); } + +TEST(CpuQuantizePass, sliceBetweenNonQuantizedOp) { + TestImmutableOpBetweenNonQuantizedOp("slice"); +} + +TEST(CpuQuantizePass, nearestInterp) { TestImmutableOp("nearest_interp"); } + +TEST(CpuQuantizePass, nearestInterpBetweenNonQuantizedOp) { + TestImmutableOpBetweenNonQuantizedOp("nearest_interp"); +} + +TEST(CpuQuantizePass, nearestInterpV2) { TestImmutableOp("nearest_interp_v2"); } + +TEST(CpuQuantizePass, nearestInterpV2BetweenNonQuantizedOp) { + TestImmutableOpBetweenNonQuantizedOp("nearest_interp_v2"); +} + +static const std::initializer_list variable_names_matmul = { + "a", "b", "c", "d", "e", "f"}; + 
+ProgramDesc BuildProgramDescMatmul() { + ProgramDesc prog; + for (auto& v : variable_names_matmul) { + prog.MutableBlock(0)->Var(v); } - EXPECT_EQ(lstm_nodes_count, expect_lstm_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); + SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); + SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); + SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); + + return prog; } -TEST(CpuQuantizePass, fusion_lstm) { - // (x, wx, wh, b)->Fusion_lstm->h - int expect_lstm_count = 1; - int expect_quant_count = 1; - int dequant_count = 0; - // 1 Quant + 1 IN + 0 DeQuant + 0 OUT - int added_nodes_count = 1 + 1 + 0 + 0; - MainTestFusionLSTM(BuildProgramDescFusionLSTM(), expect_lstm_count, - expect_quant_count, dequant_count, added_nodes_count, - 2. * 127, 128.); +ProgramDesc BuildProgramDescMatmulNotQuantized() { + ProgramDesc prog; + for (auto& v : variable_names_matmul) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false); + SetOp(&prog, "dequantize", "Dequantize", {"c"}, {"d"}, true); + SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); + + return prog; +} + +TEST(CpuQuantizePass, matmul) { + // 2 Quant + 2 IN + 1 DeQuant + 1 OUT + int added_nodes = 6; + std::unordered_map expected_operators = { + {"matmul", 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescMatmul(), variable_names_matmul, expected_operators, + added_nodes, SCALE * S8_MAX); +} + +TEST(CpuQuantizePass, matmul_not_quantized) { + // nothing change + int added_nodes = 0; + std::unordered_map expected_operators = { + {"matmul", 1}, {"quantize", 0}, {"dequantize", 1}}; + MainTest(BuildProgramDescMatmulNotQuantized(), variable_names_matmul, + expected_operators, added_nodes, 1.0f); +} + +static const std::initializer_list variable_names_elementwise_add = + {"a", "b", "c", "d", "e", "f"}; + +ProgramDesc BuildProgramDescElementwiseAdd() { + ProgramDesc prog; + for (auto& v : variable_names_elementwise_add) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); + SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); + SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true, + "int8"); + SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); + + return prog; +} + +TEST(CpuQuantizePass, elementwise_add) { + // 2 Quant + 2 IN + 1 DeQuant + 1 OUT + int added_nodes = 6; + std::unordered_map expected_operators = { + {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, + expected_operators, added_nodes, SCALE * S8_MAX); +} + +TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { + int added_nodes = 0; + std::unordered_map expected_operators = { + {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, + expected_operators, added_nodes, 1.f, 1.f, "e"); +} + +TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { + int added_nodes = 0; + std::unordered_map expected_operators = { + {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; + 
MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, + expected_operators, added_nodes, 1.f, 1.f, "", "b"); } const std::vector churn_out_vars(ProgramDesc* prog, @@ -681,395 +658,6 @@ TEST(CpuQuantizePass, multi_gru_3) { MainTestMultiGru(layers); } -static const std::initializer_list variable_names_reshape = { - "a", "w1", "b", "c", "d", "e", "f"}; - -// a->Dequantize->b -// b->Reshape->c -// c->Dropout->d -ProgramDesc BuildProgramDescReshape() { - ProgramDesc prog; - for (auto& v : variable_names_reshape) { - prog.MutableBlock(0)->Var(v); - } - SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); - SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, "int8"); - SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); - - return prog; -} - -// a->Transpose->b -// b->Reshape->c -// c->Dropout->d -ProgramDesc BuildProgramDescReshapeBetweenNonQuantizedOp() { - ProgramDesc prog; - for (auto& v : variable_names_reshape) { - prog.MutableBlock(0)->Var(v); - } - - SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, "float32"); - SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, "int8"); - SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); - - return prog; -} - -void MainTestReshape(const ProgramDesc& prog, int transpose_count, - int reshape_count, int quant_count, int dequant_count, - int added_nodes_count, float scale) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names_reshape, &original_nodes_num, - ¤t_nodes_num); - - float quant_scale = 1.0f; - float dequant_scale = 1.0f; - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int transpose_nodes_count = 0; - int reshape_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "transpose2") { - transpose_nodes_count++; - } else if (op->Type() == "reshape2") { - reshape_nodes_count++; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - quant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); - EXPECT_EQ(quant_scale, scale) << "Scale for node '" + op->Type() + "'."; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - auto op_name = op->GetAttrIfExists("name"); - VLOG(3) << op_name << "\n"; - if (op_name != "Dequantize1") { - dequant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); - EXPECT_EQ(dequant_scale, scale) - << "Scale for node '" + op->Type() + "'."; - } - } - } - } - EXPECT_EQ(transpose_nodes_count, transpose_count); - EXPECT_EQ(reshape_nodes_count, reshape_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); -} - -TEST(CpuQuantizePass, reshape) { - // a->Dequantize->b - // b2->Quant->b3->Reshape2->c1->Dequant->c2 - // c2->Dropout->d - int reshape_count = 1; - int transpose_count = 0; - int quant_count = 1; - int dequant_count = 2; - // 1 Quant + 1 IN + 1 DeQuant + 1 OUT - int added_nodes_count = 4; - MainTestReshape(BuildProgramDescReshape(), transpose_count, reshape_count, - quant_count, dequant_count, added_nodes_count, 2.0f * 127); -} - -TEST(CpuQuantizePass, reshapeBetweenNonQuantizedOp) { - // a->Transpos2->b - // b->Reshape2->c - // c->Dropout->d - int reshape_count = 1; - int transpose_count = 1; - int quant_count = 0; - int dequant_count = 0; - // 0 Quant + 0 IN + 0 DeQuant + 0 OUT - int added_nodes_count = 0; - 
MainTestReshape(BuildProgramDescReshapeBetweenNonQuantizedOp(), - transpose_count, reshape_count, quant_count, dequant_count, - added_nodes_count, 2.0f * 127); -} - -static const std::initializer_list variable_names_slice = { - "a", "b", "c", "d"}; - -// a->Dequantize->b -// b->Slice->c -// c->Dropout->d -ProgramDesc BuildProgramDescSlice() { - ProgramDesc prog; - for (auto& v : variable_names_slice) { - prog.MutableBlock(0)->Var(v); - } - SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); - SetOp(&prog, "slice", "Slice", {"b"}, {"c"}, true, "int8"); - SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); - - return prog; -} - -// a->Transpose->b -// b->slice->c -// c->Dropout->d -ProgramDesc BuildProgramDescSliceBetweenNonQuantizedOp() { - ProgramDesc prog; - for (auto& v : variable_names_slice) { - prog.MutableBlock(0)->Var(v); - } - - SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, "float32"); - SetOp(&prog, "slice", "Slice", {"b"}, {"c"}, true, "int8"); - SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); - - return prog; -} - -void MainTestSlice(const ProgramDesc& prog, int transpose_count, - int slice_count, int quant_count, int dequant_count, - int added_nodes_count, float scale) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names_slice, &original_nodes_num, - ¤t_nodes_num); - - float quant_scale = 1.0f; - float dequant_scale = 1.0f; - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int transpose_nodes_count = 0; - int slice_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "transpose2") { - transpose_nodes_count++; - } else if (op->Type() == "slice") { - slice_nodes_count++; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - quant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); - EXPECT_EQ(quant_scale, scale) << "Scale for node '" + op->Type() + "'."; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - auto op_name = op->GetAttrIfExists("name"); - VLOG(3) << op_name << "\n"; - if (op_name != "Dequantize1") { - dequant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); - EXPECT_EQ(dequant_scale, scale) - << "Scale for node '" + op->Type() + "'."; - } - } - } - } - EXPECT_EQ(transpose_nodes_count, transpose_count); - EXPECT_EQ(slice_nodes_count, slice_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); -} - -TEST(CpuQuantizePass, slice) { - // a->Dequantize->b - // b2->Quant->b3->slice->c1->Dequant->c2 - // c2->Dropout->d - int slice_count = 1; - int transpose_count = 0; - int quant_count = 1; - int dequant_count = 2; - // 1 Quant + 1 IN + 1 DeQuant + 1 OUT - int added_nodes_count = 4; - MainTestSlice(BuildProgramDescSlice(), transpose_count, slice_count, - quant_count, dequant_count, added_nodes_count, 2.0f * 127); -} - -TEST(CpuQuantizePass, sliceBetweenNonQuantizedOp) { - // a->Transpos2->b - // b->slice->c - // c->Dropout->d - int slice_count = 1; - int transpose_count = 1; - int quant_count = 0; - int dequant_count = 0; - // 0 Quant + 0 IN + 0 DeQuant + 0 OUT - int added_nodes_count = 0; - MainTestSlice(BuildProgramDescSliceBetweenNonQuantizedOp(), transpose_count, - slice_count, quant_count, dequant_count, added_nodes_count, - 2.0f * 127); -} - -static const std::initializer_list 
variable_names_matmul = { - "a", "b", "c", "d", "e", "f"}; - -ProgramDesc BuildProgramDescMatmul() { - ProgramDesc prog; - for (auto& v : variable_names_matmul) { - prog.MutableBlock(0)->Var(v); - } - SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); - SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8"); - SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); - - return prog; -} - -ProgramDesc BuildProgramDescMatmulNotQuantized() { - ProgramDesc prog; - for (auto& v : variable_names_matmul) { - prog.MutableBlock(0)->Var(v); - } - SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false); - SetOp(&prog, "dequantize", "Dequantize", {"c"}, {"d"}, true); - SetOp(&prog, "matmul", "Matmul", {"b", "d"}, {"e"}, true, "int8"); - SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); - - return prog; -} - -void MainTestMatmul(const ProgramDesc& prog, int matmul_count, int quant_count, - int dequant_count, int added_nodes_count, float scale) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names_matmul, &original_nodes_num, - ¤t_nodes_num); - - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int matmul_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "matmul") { - matmul_nodes_count++; - auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name")); - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_x")), scale) - << "Scale_x for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_y")), scale) - << "Scale_y for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_out")), scale) - << "Scale_out for node '" + op_name + "'."; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - } - } - } - EXPECT_EQ(matmul_nodes_count, matmul_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); -} - -TEST(CpuQuantizePass, matmul) { - int matmul_count = 1; - int quant_count = 2; - int dequant_count = 3; - // 2 Quant + 2 IN + 1 DeQuant + 1 OUT - int added_nodes_count = 6; - MainTestMatmul(BuildProgramDescMatmul(), matmul_count, quant_count, - dequant_count, added_nodes_count, 2.0f * 127); -} - -TEST(CpuQuantizePass, matmul_not_quantized) { - int matmul_count = 1; - int quant_count = 0; - int dequant_count = 1; - // nothing change - int added_nodes_count = 0; - MainTestMatmul(BuildProgramDescMatmulNotQuantized(), matmul_count, - quant_count, dequant_count, added_nodes_count, 1.0f); -} - -static const std::initializer_list variable_names_elementwise_add = - {"a", "b", "c", "d", "e", "f"}; - -ProgramDesc BuildProgramDescElementwiseAdd() { - ProgramDesc prog; - for (auto& v : variable_names_elementwise_add) { - prog.MutableBlock(0)->Var(v); - } - SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); - SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true, - "int8"); - SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); - - return prog; -} - -void MainTestElementwiseAdd(const ProgramDesc& prog, int elementwise_add_count, - int quant_count, int dequant_count, - int 
added_nodes_count, float scale, - bool output_scale_missing = false, - bool unsigned_and_signed_input = false) { - std::unique_ptr graph(new ir::Graph(prog)); - int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names_elementwise_add, &original_nodes_num, - ¤t_nodes_num, output_scale_missing ? "e" : "", - unsigned_and_signed_input ? "b" : ""); - - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int elementwise_add_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "elementwise_add") { - elementwise_add_nodes_count++; - if (unsigned_and_signed_input) scale = 1.0f; - auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name")); - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_x")), scale) - << "Scale_x for node '" + op_name + "'."; - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_y")), scale) - << "Scale_y for node '" + op_name + "'."; - if (output_scale_missing) scale = 1.0; - EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_out")), scale) - << "Scale_out for node '" + op_name + "'."; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - } - } - } - EXPECT_EQ(elementwise_add_nodes_count, elementwise_add_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); -} - -TEST(CpuQuantizePass, elementwise_add) { - int elementwise_add_count = 1; - int quant_count = 2; - int dequant_count = 3; - // 2 Quant + 2 IN + 1 DeQuant + 1 OUT - int added_nodes_count = 6; - MainTestElementwiseAdd(BuildProgramDescElementwiseAdd(), - elementwise_add_count, quant_count, dequant_count, - added_nodes_count, 2.0f * 127); -} - -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { - int elementwise_add_count = 1; - int quant_count = 0; - int dequant_count = 2; - int added_nodes_count = 0; - MainTestElementwiseAdd(BuildProgramDescElementwiseAdd(), - elementwise_add_count, quant_count, dequant_count, - added_nodes_count, 1.f, true); -} - -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { - int elementwise_add_count = 1; - int quant_count = 0; - int dequant_count = 2; - int added_nodes_count = 0; - MainTestElementwiseAdd(BuildProgramDescElementwiseAdd(), - elementwise_add_count, quant_count, dequant_count, - added_nodes_count, 2.0f * 127, false, true); -} - -} // namespace - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 1a701e2ef0a7e..5f74b61ee86aa 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" + #include namespace paddle { @@ -23,15 +24,34 @@ class Graph; void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; + std::unordered_set supported_op_types = + std::unordered_set( + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", + "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", + "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", + "multi_gru", "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = Get>("quantize_enabled_op_types"); + + if (!op_types_list.empty()) { + // Verify that all user-specified operators can be quantized. + for (const auto& op : op_types_list) { + PADDLE_ENFORCE_NE( + supported_op_types.count(op), 0, + platform::errors::InvalidArgument( + "Pass attribute quantize_enabled_op_types contains operator %s " + "that is not supported by OneDNN quantization.", + op)); + } + supported_op_types = op_types_list; + } Init(name_scope_, graph); GraphPatternDetector gpd; patterns::QuantizePlacement quantize_placement_pattern{gpd.mutable_pattern(), "quantize_placement"}; - quantize_placement_pattern(op_types_list); + quantize_placement_pattern(supported_op_types); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -46,16 +66,7 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { return; } - if (op->Op()->HasAttr("mkldnn_data_type") || - op->Op()->HasProtoAttr("mkldnn_data_type")) { - // use_quantizer is no longer used - // assign value for compatibility - if (op->Op()->GetAttrIfExists("use_quantizer")) { - op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); - } - op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); - op->Op()->SetAttr("use_quantizer", true); - } + op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); }; gpd(graph, handler); } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index daf913bf7d80d..350fad2c672d4 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -140,6 +140,32 @@ TEST(QuantizerPlacementPass, default_attr_value) { DefaultAttrTest(5); } +void EnabledOpTypesTest( + std::initializer_list quantize_enabled_op_types, + std::string missing_op) { + auto prog = BuildProgramDesc(); + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass"); + pass->Set("quantize_enabled_op_types", + new std::unordered_set(quantize_enabled_op_types)); + + try { + graph.reset(pass->Apply(graph.release())); + } catch (paddle::platform::EnforceNotMet& err) { + std::string ex_msg = err.what(); + std::string expected_msg = + "Pass attribute quantize_enabled_op_types contains operator " + + missing_op + " that is not supported by OneDNN quantization."; + EXPECT_TRUE(ex_msg.find(expected_msg) != std::string::npos); + } +} + +TEST(QuantizerPlacementPass, unsupported_op_type) { + // Dropout op is not supported by OneDNN quantization + EnabledOpTypesTest({"conv2d", "dropout"}, "dropout"); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc 
b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index d0962757185e2..96f575745a3a2 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -16,6 +16,7 @@ #include #include #include +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" @@ -178,3 +179,8 @@ void ReshapeTransposeMatmulMkldnnFusePass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(reshape_transpose_matmul_mkldnn_fuse_pass, paddle::framework::ir::ReshapeTransposeMatmulMkldnnFusePass); + +REGISTER_PASS_CAPABILITY(reshape_transpose_matmul_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "matmul", 1)); diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index f4cca78b6da03..7e61d6ae4248b 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -122,6 +122,10 @@ class Node { // Please don't use this API! int id() const { return id_; } + // Only use this for auto parallel. + // A node does not have original desc if the return is zero. + uint64_t OriginalDescId() const { return original_desc_id_; } + bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } bool IsCtrlVar() const { @@ -239,6 +243,10 @@ class Node { int desc_order_; int block_id_{-1}; + // Store the original id of var desc or op desc. + // Only use this for auto parallel. + uint64_t original_desc_id_{0}; + private: // ID can only set by a Graph. void SetId(int id) { id_ = id; } @@ -267,14 +275,16 @@ class Node { op_desc_(nullptr), type_(Type::kVariable), desc_order_(NO_DESC_ORDER), - block_id_(block_id) {} + block_id_(block_id), + original_desc_id_(var_desc->OriginalId()) {} explicit Node(OpDesc* op_desc) : name_(op_desc->Type()), var_desc_(nullptr), op_desc_(new OpDesc(*op_desc, op_desc->Block())), type_(Type::kOperation), - desc_order_(NO_DESC_ORDER) {} + desc_order_(NO_DESC_ORDER), + original_desc_id_(op_desc->OriginalId()) {} Node() = delete; diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 7dee0f44e384d..dff6d0e01839a 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -144,8 +144,8 @@ class LoDTensor : public Tensor { */ size_t NumLevels() const { return lod_.size(); } /* - * Number of elements in a level. - */ + * Number of elements in a level. + */ size_t NumElements(size_t level = 0) const { PADDLE_ENFORCE_LT( level, NumLevels(), diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc new file mode 100644 index 0000000000000..b15a66c51c4b6 --- /dev/null +++ b/paddle/fluid/framework/mixed_vector.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/mixed_vector.h" + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/details/cow_ptr.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/utils/none.h" +#include "paddle/utils/optional.h" + +namespace paddle { +namespace framework { + +template +void CopyToCPUHelper(std::vector *cpu_, paddle::memory::AllocationPtr *gpu_, + size_t *gpu_memory_size_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // COPY GPU Data To CPU + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get((*gpu_)->place())); + auto stream = dev_ctx->stream(); + void *src = (*gpu_)->ptr(); + void *dst = cpu_->data(); + paddle::memory::Copy(platform::CPUPlace(), dst, + OptionalCUDAPlace(*gpu_).get(), src, *gpu_memory_size_, + stream); + dev_ctx->Wait(); +#endif +} + +template +void CopyCPUDataToCUDAHelper(std::vector *cpu_, + paddle::memory::AllocationPtr *gpu_, + size_t *gpu_memory_size_, + const platform::Place &place) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void *src = cpu_->data(); + *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) + (*gpu_) = memory::Alloc(place, *gpu_memory_size_); + void *dst = (*gpu_)->ptr(); + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto stream = dev_ctx->stream(); + paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), dst, + platform::CPUPlace(), src, *gpu_memory_size_, stream); +#endif +} + +#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \ + template <> \ + void Vector<__TYPE__>::VectorData::CopyToCPU() const { \ + CopyToCPUHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_); \ + } \ + \ + template <> \ + void Vector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \ + const platform::Place &place) const { \ + CopyCPUDataToCUDAHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_, place); \ + } + +INSTANTIATE_VECTOR_FOR_TYPE(size_t) +INSTANTIATE_VECTOR_FOR_TYPE(int) +INSTANTIATE_VECTOR_FOR_TYPE(int64_t) + +}; // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index cf71cdfc6d651..d1aee6cb2f662 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -23,17 +23,21 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/framework/details/cow_ptr.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/utils/none.h" #include "paddle/utils/optional.h" namespace paddle { namespace framework { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +inline paddle::optional OptionalCUDAPlace( + const paddle::memory::allocation::AllocationPtr &gpu_) { + return gpu_ == nullptr + ? paddle::none + : paddle::optional( + BOOST_GET_CONST(platform::CUDAPlace, gpu_->place())); +} + // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template @@ -198,10 +202,7 @@ class Vector { std::mutex &Mutex() const { return mtx_; } paddle::optional CUDAPlace() const { - return gpu_ == nullptr - ? 
paddle::none - : paddle::optional( - BOOST_GET_CONST(platform::CUDAPlace, gpu_->place())); + return OptionalCUDAPlace(gpu_); } private: @@ -212,17 +213,7 @@ class Vector { kDirty = 0x10 }; - void CopyToCPU() const { - // COPY GPU Data To CPU - auto *dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(gpu_->place())); - auto stream = dev_ctx->stream(); - void *src = gpu_->ptr(); - void *dst = cpu_.data(); - paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_memory_size_, stream); - dev_ctx->Wait(); - } + void CopyToCPU() const; void MutableCPU() { if (IsInCUDA() && IsDirty()) { @@ -260,17 +251,7 @@ class Vector { } } - void CopyCPUDataToCUDA(const platform::Place &place) const { - void *src = cpu_.data(); - gpu_memory_size_ = cpu_.size() * sizeof(T); - gpu_ = memory::Alloc(place, gpu_memory_size_); - void *dst = gpu_->ptr(); - auto *dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto stream = dev_ctx->stream(); - paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_memory_size_, stream); - } + void CopyCPUDataToCUDA(const platform::Place &place) const; void ImmutableCPU() const { if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or @@ -291,7 +272,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable paddle::memory::AllocationPtr gpu_; + mutable paddle::memory::allocation::AllocationPtr gpu_; mutable size_t gpu_memory_size_{0}; mutable int flag_; @@ -465,81 +446,5 @@ class Vector { mutable details::COWPtr m_; }; -#else // PADDLE_WITH_CUDA - -template -class CPUVector : public std::vector> { - public: - CPUVector() : std::vector() {} - CPUVector(size_t count, const T &value = T()) // NOLINT - : std::vector(count, value) {} - CPUVector(std::initializer_list init) : std::vector(init) {} - CPUVector(const std::vector &other) : std::vector(other) {} // NOLINT - CPUVector(const CPUVector &other) : std::vector(other) {} - CPUVector(CPUVector &&other) : std::vector(std::move(other)) {} - CPUVector(std::vector &&other) // NOLINT - : std::vector(std::move(other)) {} - CPUVector &operator=(const CPUVector &other) { - this->assign(other.begin(), other.end()); - return *this; - } - CPUVector &operator=(const std::vector &other) { - this->assign(other.begin(), other.end()); - return *this; - } - - friend std::ostream &operator<<(std::ostream &os, const CPUVector &other) { - std::stringstream ss; - for (auto v : other) { - os << v << " "; - } - return os; - } - - T &operator[](size_t id) { return this->at(id); } - - const T &operator[](size_t id) const { return this->at(id); } - - template - void Extend(const D &begin, const D &end) { - this->reserve(this->size() + size_t(end - begin)); - this->insert(this->end(), begin, end); - } - - const T *CUDAData(platform::Place place) const { - PADDLE_THROW(platform::errors::Unavailable( - "Vector::CUDAData() method is not supported in CPU-only version.")); - } - - T *CUDAMutableData(platform::Place place) { - PADDLE_THROW(platform::errors::Unavailable( - "Vector::CUDAMutableData() method is not supported in CPU-only " - "version.")); - } - - const T *Data(platform::Place place) const { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(place), true, - platform::errors::Unavailable( - "Vector::Data() method is not supported when not in CPUPlace.")); - return this->data(); - } - - T *MutableData(platform::Place place) { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(place), true, - 
platform::errors::Unavailable("Vector::MutableData() method is not " - "supported when not in CPUPlace.")); - return this->data(); - } - - const void *Handle() const { return static_cast(this); } -}; - -template -using Vector = CPUVector; - -#endif // PADDLE_WITH_CUDA - }; // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 10e7ed0fb6021..011e2729d4adf 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -25,6 +25,7 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device_context.h" template using vec = paddle::framework::Vector; diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 7d55d8c41e3e9..9bd6aba3ea842 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -20,6 +20,9 @@ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#if PADDLE_WITH_TENSORRT +#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" +#endif namespace paddle { namespace framework { @@ -132,5 +135,38 @@ NaiveExecutor::~NaiveExecutor() { #endif } +void NaiveExecutor::ResetTrtOps(int num) { +#if PADDLE_WITH_TENSORRT + for (auto &op : ops_) { + if (op->Type() == "tensorrt_engine") { + operators::TensorRTEngineOp *trtop = + dynamic_cast(op.get()); + if (!trtop) return; + std::string engine_key = trtop->Attr("engine_key"); + int engine_predictor_id = trtop->Attr("predictor_id"); + std::string engine_name = + engine_key + std::to_string(engine_predictor_id); + operators::TensorRTEngine *trt_engine = + paddle::inference::Singleton< + inference::tensorrt::TRTEngineManager>::Global() + .Get(engine_name); + if (trt_engine->with_dynamic_shape()) { + LOG(INFO) << "rebuild trt engine, this may cost a lot of time!"; + trt_engine->ResetContext(); + trt_engine->ClearTensorMap(); + trt_engine->SetProfileNum(num); + auto *anc = scope_->parent(); + while (anc && anc->parent()) { + anc = anc->parent(); + } + if (anc == nullptr) { + anc = scope_; + } + trtop->PrepareTRTEngine(*anc, trt_engine); + } + } + } +#endif +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index f38632a9a639c..ed475e66f626d 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -63,6 +63,8 @@ class NaiveExecutor { void CleanFeedFetchOps(); + void ResetTrtOps(int num); + protected: void CreateOps(const ProgramDesc& desc, int block_id, bool with_feed_fetch_ops); diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index e21588da7fdd8..e268bce87acf1 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -1,18 +1,34 @@ set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method -graph_to_program_pass variable_helper timer monitor nan_inf_utils) +graph_to_program_pass variable_helper timer monitor nan_inf_utils interpretercore_event_garbage_collector) + +if(WITH_GPU) +list(APPEND INTERPRETERCORE_DEPS 
interpretercore_fast_garbage_collector) +endif() + +add_subdirectory(workqueue) cc_library(data_transfer SRCS data_transfer.cc DEPS enforce scope glog) -cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce) cc_library(new_executor_defs SRCS new_executor_defs.cc DEPS enforce glog scope) -cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS} executor_gc_helper) +cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS garbage_collector) +cc_library(interpretercore_event_garbage_collector SRCS interpretercore_event_garbage_collector.cc DEPS interpretercore_garbage_collector) cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer) cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs) cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs) -cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_garbage_collector stream_analyzer event_manager) +cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_event_garbage_collector stream_analyzer event_manager) cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore) -cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue) +if(WITH_GPU OR WITH_ROCM) + if(WITH_GPU) + nv_library(interpretercore_fast_garbage_collector SRCS interpretercore_fast_garbage_collector.cc DEPS interpretercore_garbage_collector) + elseif(WITH_ROCM) + hip_library(interpretercore_fast_garbage_collector SRCS interpretercore_fast_garbage_collector.cc DEPS interpretercore_garbage_collector) + endif() + + target_link_libraries(interpretercore interpretercore_fast_garbage_collector) +endif() + +# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) # skip win32 since wget is not installed by default on windows machine. # skip COVERAGE_CI since the test runs slowly because of instrumentation. if (WITH_TESTING AND NOT WIN32 AND NOT WITH_COVERAGE AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 064dfa0170bdb..9230c36a0c745 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -94,8 +94,7 @@ void DataTranferHelper::RunAndConstructOpFuncNode( // 2. 
Execute infer shape and choose kernel auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); - static_cast(op.get())->InferShape( - &infer_shape_ctx); + op.get()->Info().infer_shape_(&infer_shape_ctx); auto kernels_iter = all_op_kernels.find(op_type); PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(), platform::errors::Unavailable( diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index dcbdd12f88fb7..950756c0394a5 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -16,10 +16,15 @@ #include #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" +#include "paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/profiler.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.h" +#endif + PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, "Use inplace in new executor"); PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, @@ -28,14 +33,21 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); +DECLARE_bool(fast_eager_deletion_mode); +DECLARE_bool(use_stream_safe_cuda_allocator); constexpr const char* kExceptionCaught = "ExceptionCaught"; +constexpr const char* kTaskCompletion = "TaskCompletion"; namespace paddle { namespace framework { // NOTE(Aurelius84): Need a better strategy to determine it. 
static constexpr size_t kHostNumThreads = 4; +bool IsInterpretercoreFastGCEnabled() { + return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator; +} + InterpreterCore::InterpreterCore(const platform::Place& place, const BlockDesc& block, VariableScope* global_scope) @@ -46,10 +58,19 @@ InterpreterCore::InterpreterCore(const platform::Place& place, is_build_ = false; async_work_queue_.reset( new interpreter::AsyncWorkQueue(kHostNumThreads, &main_thread_blocker_)); - gc_.reset(new InterpreterCoreGarbageCollector()); - exception_notifier_ = main_thread_blocker_.RegisterEvent( - kExceptionCaught, [this]() { return exception_holder_.IsCaught(); }); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (IsInterpretercoreFastGCEnabled()) { + gc_ = std::make_unique(); + } else { + gc_ = std::make_unique(); + } +#else + gc_ = std::make_unique(); +#endif + + exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); + completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); create_local_scope_ = FLAGS_new_executor_use_local_scope; if (FLAGS_new_executor_use_local_scope) { @@ -70,6 +91,9 @@ InterpreterCore::~InterpreterCore() { // cancel gc's thread gc_.reset(nullptr); + exception_notifier_->UnregisterEvent(); + completion_notifier_->UnregisterEvent(); + async_work_queue_.reset(nullptr); } @@ -389,7 +413,23 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); } else { - instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); + // fit for pten + if (instr_node.PtenKernel() && instr_node.PtenKernel()->IsValid()) { + VLOG(4) << "Run pten kernel: " << op->Type(); + VLOG(4) << instr_node.InnerRuntimeContext().get() << " " + << &instr_node.DeviceContext(); + op_with_kernel->BuildPtenKernelContext( + *instr_node.InnerRuntimeContext().get(), + const_cast(&instr_node.DeviceContext())); + + (*instr_node.PtenKernel())(instr_node.PtenKernelContext()); + + op_with_kernel->WriteBackToOutputs( + instr_node.InnerRuntimeContext().get()); + instr_node.PtenKernelContext()->ClearData(); + } else { + instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); + } } } @@ -418,7 +458,7 @@ void InterpreterCore::ExecuteInstructionList( const std::vector& vec_instr) { async_work_queue_->PrepareAtomicDeps(dependecy_count_); async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo()); - op_run_number_ = 0; + unfinished_op_numer_ = vec_instr.size(); exception_holder_.Clear(); @@ -430,19 +470,22 @@ void InterpreterCore::ExecuteInstructionList( } auto event_name = main_thread_blocker_.WaitEvent(); - VLOG(3) << "event_name: " << event_name; + VLOG(1) << "event_name: " << event_name; if (UNLIKELY(exception_holder_.IsCaught())) { - VLOG(4) << "Exception caught " << exception_holder_.Type(); + VLOG(1) << "Exception caught " << exception_holder_.Type(); + // NOTE(xiongkun) Why do we reset? + // The caught exception may be EOFException; in this situation we need + // to keep async_work_queue_ available, so we reset it.
async_work_queue_->Cancel(); + async_work_queue_.reset(new interpreter::AsyncWorkQueue( + kHostNumThreads, &main_thread_blocker_)); + PADDLE_ENFORCE_EQ( + main_thread_blocker_.Clear(), 0, + platform::errors::PreconditionNotMet( + "main_thread_blocker_.Clear() return -1, clear failed")); exception_holder_.ReThrow(); } - - PADDLE_ENFORCE_EQ( - op_run_number_.load(), vec_instr.size(), - platform::errors::Fatal( - "Required op_run_number == %d, but received op_run_number = %d.", - vec_instr.size(), op_run_number_.load())); } void InterpreterCore::RunNextInstructions( @@ -515,7 +558,10 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { try { RunInstruction(instr_node); - // GC infomation + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + RecordStreamForGC(instr_node); +#endif CheckGC(instr_node); } catch (platform::EnforceNotMet& ex) { framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); @@ -540,13 +586,113 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { return; } + VLOG(4) << "unfinished_op_numer_: " << unfinished_op_numer_; + if (UNLIKELY(unfinished_op_numer_.fetch_sub(1, std::memory_order_relaxed) == + 1)) { + if (completion_notifier_ != nullptr) { + completion_notifier_->NotifyEvent(); + } + } + interpreter::RecordEvent(instr_node, place_); - op_run_number_.fetch_add(1, std::memory_order_relaxed); RunNextInstructions(instr_node, &ready_ops); } } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +void InterpreterCore::RecordStreamForGC(const Instruction& instr) { + if (!IsInterpretercoreFastGCEnabled() || + instr.KernelType() != OpFuncType::kQueueAsync) { + return; + } + + gpuStream_t stream = reinterpret_cast( + instr.DeviceContext()) + .stream(); + auto TensorRecordStream = [&stream](Tensor& tensor) { + auto allocation = tensor.Holder(); + if (allocation == nullptr) { + return; + } + + const platform::Place& place = allocation->place(); + if (platform::is_gpu_place(place)) { + memory::RecordStream(allocation, stream); + } else if (platform::is_cuda_pinned_place(place)) { + // TODO(Ruibiao): Something should be done here to make sure that the tensor + // is not freed until the H2D copies are done. However, simply launching a CUDA + // runtime callback to the H2D stream may lead to high performance + // overhead. As all the cases we meet in H2D are copies from CPUPlace at + // present, we just log a WARNING here. A better design is required. + LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous " + "manner may lead to data inconsistency"; + } else { + // memory copies involving CPUPlace are always synchronous, so just do + // nothing here + } + }; + + /* NOTE(Ruibiao): Cross-stream tensor synchronization is required only when + * all the following conditions are satisfied: + * 1. The tensor will be GC'd after running the instruction, i.e., in + * instr.GCCheckVars. + * 2. The stream which initializes this tensor is different from the stream + * which the instruction runs in. + * 3. The tensor is the instruction's input, because we assume that the instruction + * will initialize all output tensors with its running stream. + * 4. In the OP function of this instruction, the tensor is an input of an + * async CUDA kernel. + * + * Here we only process the first condition, because: + * 1. Since the RecordStream function will directly return when the recorded + * stream is equal to the owning stream, recording the same stream as the one that + * initialized this tensor has less time overhead. 
Conversely, it may take + * more time if we try to extract those cross-stream input vars from + * instr.GCCheckVars. + * 2. Now the instruction has no idea of which vars involving async running in + * OP function, and thus we can not recognize condition 4. It should be + * supported later. + */ + for (int var_id : instr.GCCheckVars()) { + VLOG(4) << "GC sync " << global_scope_->GetNameById(var_id) << " " + << global_scope_->VarDesc(var_id); + + // persistable var will be ignore while GC + if (global_scope_->VarDesc(var_id) && + global_scope_->VarDesc(var_id)->Persistable()) { + continue; + } + + paddle::framework::Variable* var = global_scope_->Var(var_id); + if (var == nullptr) { + continue; + } + + if (var->IsType()) { + TensorRecordStream(*(var->GetMutable())); + } else if (var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + // do nothing + } else if (var->IsType()) { + TensorRecordStream(*(var->GetMutable()->mutable_value())); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& tensor : *tensor_arr) { + TensorRecordStream(tensor); + } + } else if (var->IsType>()) { + // do nothing + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The variable(%s) is not supported in eager deletion.", + framework::ToTypeName(var->Type()))); + } + } +} +#endif + void InterpreterCore::CheckGC(const Instruction& instr) { size_t instr_id = instr.Id(); auto& var_scope = *global_scope_; @@ -565,8 +711,21 @@ void InterpreterCore::CheckGC(const Instruction& instr) { if (is_ready) { VLOG(6) << "Async delete variable with name : " << var_scope.GetNameById(var_id); - gc_->Add(var_scope.Var(var_id), gc_event_.at(instr_id), - &instr.DeviceContext()); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (IsInterpretercoreFastGCEnabled()) { + static_cast(gc_.get())->Add( + var_scope.Var(var_id)); + + } else { + static_cast(gc_.get())->Add( + var_scope.Var(var_id), gc_event_.at(instr_id), + &instr.DeviceContext()); + } +#else + static_cast(gc_.get())->Add( + var_scope.Var(var_id), gc_event_.at(instr_id), + &instr.DeviceContext()); +#endif } } } diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 656262d6381f6..277093c082fd9 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -26,8 +26,6 @@ #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/framework/new_executor/profiler.h" #include "paddle/fluid/framework/new_executor/stream_analyzer.h" -#include "paddle/fluid/framework/new_executor/workqueue.h" -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" @@ -74,6 +72,10 @@ class InterpreterCore { const std::vector& feed_tensors, bool prepare_feed); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void RecordStreamForGC(const Instruction& instr); +#endif + void CheckGC(const Instruction& instr); void RunInstructionAsync(size_t instr_id); @@ -103,7 +105,7 @@ class InterpreterCore { std::vector vec_instruction_; // deconstruct before OpFuncNode std::vector dependecy_count_; - std::atomic op_run_number_{0}; + std::atomic unfinished_op_numer_{0}; std::vector> input_var2op_info_; StreamAnalyzer stream_analyzer_; @@ -111,6 +113,7 @@ class InterpreterCore { std::unique_ptr async_work_queue_; 
details::ExceptionHolder exception_holder_; std::shared_ptr exception_notifier_{nullptr}; + std::shared_ptr completion_notifier_{nullptr}; std::unique_ptr gc_; std::vector gc_event_; diff --git a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc new file mode 100644 index 0000000000000..7beefec4487de --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.h" + +#if !defined(_WIN32) +#include +#else +#define NOMINMAX +#include +#endif // !_WIN32 + +namespace paddle { +namespace framework { + +InterpreterCoreEventGarbageCollector::InterpreterCoreEventGarbageCollector() { + WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, + /*track_task*/ false); + queue_ = CreateSingleThreadedWorkQueue(options); +} + +InterpreterCoreEventGarbageCollector::~InterpreterCoreEventGarbageCollector() { + queue_.reset(nullptr); +} + +void InterpreterCoreEventGarbageCollector::Add( + Garbage garbage, platform::DeviceEvent& event, + const platform::DeviceContext* ctx) { + if (!garbage) { + return; + } + + if (max_memory_size_ <= 1) { + Free(garbage, event, ctx); + } else { + std::unique_ptr pending_delete_garbages; + { // lock guard + std::lock_guard guard(spinlock_); + cur_memory_size_ += garbage->size(); + garbages_->push_back(std::move(garbage)); + + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + pending_delete_garbages = std::move(garbages_); + garbages_ = std::make_unique(); + } + } + } +} + +void InterpreterCoreEventGarbageCollector::Add( + Variable* var, platform::DeviceEvent& event, + const platform::DeviceContext* ctx) { + if (UNLIKELY(max_memory_size_ < 0) || var == nullptr) { + return; + } + + if (var->IsType()) { + Add(var->GetMutable()->MoveMemoryHolder(), event, ctx); + } else if (var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + // TODO(xiongkun03) in old executor, this type of variable is not support + // eager deletion. so we just leave it here ? + } else if (var->IsType()) { + // TODO(xiongkun03) in old executor, this type of variable is not support + // eager deletion. so we just leave it here ? + } else if (var->IsType()) { + Add(var->GetMutable()->mutable_value()->MoveMemoryHolder(), + event, ctx); + var->GetMutable()->mutable_rows()->clear(); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& t : *tensor_arr) { + Add(t.MoveMemoryHolder(), event, ctx); + } + } else if (var->IsType>()) { + // NOTE(@xiongkun03) conditional_op / while_op will create a STEP_SCOPE + // refer to executor.cc to see what old garbage collector does. + // do nothing, because the sub scope will be deleted by sub-executor. 
+ } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The variable(%s) is not supported in eager deletion.", + framework::ToTypeName(var->Type()))); + } +} + +void InterpreterCoreEventGarbageCollector::Free( + GarbageQueue* garbages, platform::DeviceEvent& event, + const platform::DeviceContext* ctx) { + event.Record(ctx); + event.SetFininshed(); // Only for CPU Event + queue_->AddTask([ container = garbages, event = &event ]() { + while (!event->Query()) { +#if defined(_WIN32) + SleepEx(50, FALSE); +#else + sched_yield(); +#endif + continue; + } + delete container; + }); +} + +void InterpreterCoreEventGarbageCollector::Free( + Garbage& garbage, platform::DeviceEvent& event, + const platform::DeviceContext* ctx) { + event.Record(ctx); + event.SetFininshed(); // Only for CPU Event + queue_->AddTask([ container = garbage, event = &event ]() { + while (!event->Query()) { +#if defined(_WIN32) + SleepEx(50, FALSE); +#else + sched_yield(); +#endif + continue; + } + }); +} + +} // namespace framework +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.h b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.h new file mode 100644 index 0000000000000..ab329f196da34 --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.h @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" + +namespace paddle { +namespace framework { + +class InterpreterCoreEventGarbageCollector + : public InterpreterCoreGarbageCollector { + public: + InterpreterCoreEventGarbageCollector(); + ~InterpreterCoreEventGarbageCollector(); + + virtual void Add(Variable* var, platform::DeviceEvent& event, + const platform::DeviceContext* ctx) override; + + private: + void Add(Garbage garbage, platform::DeviceEvent& event, + const platform::DeviceContext* ctx); + void Free(GarbageQueue* garbages, platform::DeviceEvent& event, + const platform::DeviceContext* ctx); + void Free(Garbage& garbage, platform::DeviceEvent& event, + const platform::DeviceContext* ctx); + + std::unique_ptr queue_; + paddle::memory::SpinLock spinlock_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc new file mode 100644 index 0000000000000..784cfca943ea1 --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.h" + +namespace paddle { +namespace framework { + +void InterpreterCoreFastGarbageCollector::Add(Variable* var) { + if (UNLIKELY(max_memory_size_ < 0) || var == nullptr) { + return; + } + + if (var->IsType()) { + Add(var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType< + operators::reader:: + OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { + // TODO(xiongkun03) in old executor, this type of variable is not support + // eager deletion. so we just leave it here ? + } else if (var->IsType()) { + // TODO(xiongkun03) in old executor, this type of variable is not support + // eager deletion. so we just leave it here ? + } else if (var->IsType()) { + Add(var->GetMutable()->mutable_value()->MoveMemoryHolder()); + var->GetMutable()->mutable_rows()->clear(); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& t : *tensor_arr) { + Add(t.MoveMemoryHolder()); + } + } else if (var->IsType>()) { + // NOTE(@xiongkun03) conditional_op / while_op will create a STEP_SCOPE + // refer to executor.cc to see what old garbage collector does. + // do nothing, because the sub scope will be deleted by sub-executor. + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The variable(%s) is not supported in eager deletion.", + framework::ToTypeName(var->Type()))); + } +} + +void InterpreterCoreFastGarbageCollector::Add(Garbage garbage) { + if (!garbage) { + return; + } + + if (max_memory_size_ > 1) { + std::unique_ptr pending_delete_garbages; + { // lock guard + std::lock_guard guard(spinlock_); + cur_memory_size_ += garbage->size(); + garbages_->push_back(std::move(garbage)); + + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + pending_delete_garbages = std::move(garbages_); + garbages_ = std::make_unique(); + } + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.h b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.h new file mode 100644 index 0000000000000..ad19db049468f --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include "paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h" + +namespace paddle { +namespace framework { + +class InterpreterCoreFastGarbageCollector + : public InterpreterCoreGarbageCollector { + public: + virtual void Add(Variable* var) override; + + private: + void Add(Garbage garbage); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc index 40537815b48bf..9345546e65f99 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.cc @@ -19,114 +19,23 @@ namespace paddle { namespace framework { InterpreterCoreGarbageCollector::InterpreterCoreGarbageCollector() { - garbages_.reset(new GarbageQueue()); - max_memory_size_ = static_cast(GetEagerDeletionThreshold()); + garbages_ = std::make_unique(); + max_memory_size_ = static_cast(GetEagerDeletionThreshold()); cur_memory_size_ = 0; - - WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ false); - queue_ = CreateSingleThreadedWorkQueue(options); } -InterpreterCoreGarbageCollector::~InterpreterCoreGarbageCollector() { - queue_.reset(nullptr); +void InterpreterCoreGarbageCollector::Add(Variable* var) { + PADDLE_THROW( + platform::errors::Unimplemented("Not allowed to call the member function " + "of InterpreterCoreGarbageCollector")); } -void InterpreterCoreGarbageCollector::Add( - std::shared_ptr garbage, - paddle::platform::DeviceEvent& event, const platform::DeviceContext* ctx) { - if (max_memory_size_ <= 1) { - Free(garbage, event, ctx); - } else { - if (!garbage) return; - GarbageQueue* garbage_ptr = nullptr; - { - std::lock_guard guard(spinlock_); - cur_memory_size_ += garbage->size(); - garbages_->push_back(std::move(garbage)); - - if (cur_memory_size_ >= max_memory_size_) { - cur_memory_size_ = 0; - garbage_ptr = garbages_.release(); - garbages_.reset(new GarbageQueue()); - } - } - if (garbage_ptr) { - Free(garbage_ptr, event, ctx); - } - } -} - -void InterpreterCoreGarbageCollector::Add(paddle::framework::Variable* var, - paddle::platform::DeviceEvent& event, +void InterpreterCoreGarbageCollector::Add(Variable* var, + platform::DeviceEvent& event, const platform::DeviceContext* ctx) { - if (!var) { - return; - } - - if (var->IsType()) { - Add(var->GetMutable()->MoveMemoryHolder(), event, ctx); - } else if (var->IsType< - operators::reader:: - OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { - // TODO(xiongkun03) in old executor, this type of variable is not support - // eager deletion. so we just leave it here ? - } else if (var->IsType()) { - // TODO(xiongkun03) in old executor, this type of variable is not support - // eager deletion. so we just leave it here ? - } else if (var->IsType()) { - Add(var->GetMutable()->mutable_value()->MoveMemoryHolder(), - event, ctx); - var->GetMutable()->mutable_rows()->clear(); - } else if (var->IsType()) { - auto* tensor_arr = var->GetMutable(); - for (auto& t : *tensor_arr) { - Add(t.MoveMemoryHolder(), event, ctx); - } - } else if (var->IsType>()) { - // NOTE(@xiongkun03) conditional_op / while_op will create a STEP_SCOPE - // refer to executor.cc to see what old garbage collector does. - // do nothing, because the sub scope will be deleted by sub-executor. 
- } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The variable(%s) is not supported in eager deletion.", - framework::ToTypeName(var->Type()))); - } -} - -void InterpreterCoreGarbageCollector::Free(GarbageQueue* garbages, - paddle::platform::DeviceEvent& event, - const platform::DeviceContext* ctx) { - event.Record(ctx); - event.SetFininshed(); // Only for CPU Event - queue_->AddTask([ container = garbages, event = &event ]() { - while (!event->Query()) { -#if defined(_WIN32) - SleepEx(50, FALSE); -#else - sched_yield(); -#endif - continue; - } - delete container; - }); -} - -void InterpreterCoreGarbageCollector::Free( - std::shared_ptr& garbage, - paddle::platform::DeviceEvent& event, const platform::DeviceContext* ctx) { - event.Record(ctx); - event.SetFininshed(); // Only for CPU Event - queue_->AddTask([ container = garbage, event = &event ]() { - while (!event->Query()) { -#if defined(_WIN32) - SleepEx(50, FALSE); -#else - sched_yield(); -#endif - continue; - } - }); + PADDLE_THROW( + platform::errors::Unimplemented("Not allowed to call the member function " + "of InterpreterCoreGarbageCollector")); } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h index 166139a73c8f9..5a0554d577aff 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h +++ b/paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h @@ -13,54 +13,31 @@ // limitations under the License. #pragma once -#if !defined(_WIN32) -#include -#else -#define NOMINMAX -#include -#endif // !_WIN32 - #include -#include - -#include "paddle/fluid/framework/new_executor/workqueue.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" namespace paddle { namespace framework { -using GarbageQueue = std::deque>; +using Garbage = std::shared_ptr; +using GarbageQueue = std::deque; + class InterpreterCoreGarbageCollector { public: InterpreterCoreGarbageCollector(); - - ~InterpreterCoreGarbageCollector(); - - void Add(std::shared_ptr garbage, // NOLINT - paddle::platform::DeviceEvent& event, // NOLINT - const platform::DeviceContext* ctx); - - void Add(paddle::framework::Variable* var, - paddle::platform::DeviceEvent& event, // NOLINT - const platform::DeviceContext* ctx); - + virtual ~InterpreterCoreGarbageCollector(){}; + virtual void Add(Variable* var); + virtual void Add(Variable* var, platform::DeviceEvent& event, + const platform::DeviceContext* ctx); DISABLE_COPY_AND_ASSIGN(InterpreterCoreGarbageCollector); - private: - void Free(GarbageQueue* garbages, - paddle::platform::DeviceEvent& event, // NOLINT - const platform::DeviceContext* ctx); - - void Free(std::shared_ptr& garbage, // NOLINT - paddle::platform::DeviceEvent& event, // NOLINT - const platform::DeviceContext* ctx); - + protected: std::unique_ptr garbages_; - size_t max_memory_size_; - size_t cur_memory_size_; - std::unique_ptr queue_; - paddle::memory::SpinLock spinlock_; + int64_t max_memory_size_; + int64_t cur_memory_size_; + memory::SpinLock spinlock_; }; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 3817a11b9afe4..7ced4853c2d8f 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -19,10 +19,13 @@ #include 
"paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include "paddle/pten/core/kernel_factory.h" PADDLE_DEFINE_EXPORTED_bool( new_executor_sequential_run, false, "Enable sequential execution for standalone executor, used for debug"); +DECLARE_bool(run_pten_kernel); + namespace paddle { namespace framework { namespace interpreter { @@ -338,6 +341,8 @@ void build_op_func_list(const platform::Place& place, // op is not a operatorwithkernel, so direcly run OperatorBase::Run() deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope); } else { + auto op_with_kernel = + static_cast(op); // construct RuntimeContext and analysis KernelType RuntimeContext runtime_context({}, {}); runtime_context.inputs.swap(ins_map); @@ -350,8 +355,7 @@ void build_op_func_list(const platform::Place& place, // TODO(Aurelius84): In case of control flow ops, they are NOT // inheritted // from OperatorWithKernel. - static_cast(op)->InferShape( - &infer_shape_ctx); + op_with_kernel->Info().infer_shape_(&infer_shape_ctx); } auto kernels_iter = all_op_kernels.find(op->Type()); @@ -367,10 +371,8 @@ void build_op_func_list(const platform::Place& place, platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); Scope scope; - auto expected_kernel_key = - dynamic_cast(op) - ->GetExpectedKernelType( - ExecutionContext(*op, scope, *dev_ctx, runtime_context)); + auto expected_kernel_key = op_with_kernel->GetExpectedKernelType( + ExecutionContext(*op, scope, *dev_ctx, runtime_context)); // change device by the device_guard() apply_device_guard(op, place, &expected_kernel_key); @@ -378,10 +380,16 @@ void build_op_func_list(const platform::Place& place, // step 3. apply data transforms and insert data transfer ops VariableValueMap& ins_map_temp = runtime_context.inputs; + + // NOTE(zhiqiu): op_func_node->operator_base_ maybe changed in + // ApplyDataTransform ApplyDataTransform(expected_kernel_key, place, &ins_map_temp, var_scope, &op_func_node, vec_func_list, use_local_scope); + op_with_kernel = static_cast( + op_func_node.operator_base_.get()); + // step 4. 
Run op kernel - VLOG(3) << op->Type() + VLOG(3) << op_with_kernel->Type() << " : expected_kernel_key : " << expected_kernel_key; if (platform::is_gpu_place(expected_kernel_key.place_)) { @@ -397,7 +405,8 @@ void build_op_func_list(const platform::Place& place, } op_func_node.dev_ctx_ = dev_ctx; - auto exec_ctx = ExecutionContext(*op, scope, *dev_ctx, runtime_context); + auto exec_ctx = + ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context); auto kernel_iter = kernels.find(expected_kernel_key); PADDLE_ENFORCE_NE( @@ -406,8 +415,27 @@ void build_op_func_list(const platform::Place& place, "Operator (%s) does not have kernel for %s.", op->Type(), KernelTypeToString(expected_kernel_key))); - op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); - op_func_node.kernel_func_(exec_ctx); + auto run_pten_kernel = false; + + if (FLAGS_run_pten_kernel && + pten::KernelFactory::Instance().HasCompatiblePtenKernel( + op_with_kernel->Type())) { + op_with_kernel->ChoosePtenKernel(exec_ctx); + run_pten_kernel = op_with_kernel->PtenKernel()->IsValid(); + } + + if (run_pten_kernel) { + op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx); + op_func_node.pt_kernel_ = op_with_kernel->PtenKernel(); + op_func_node.pt_kernel_context_ = op_with_kernel->PtenKernelContext(); + + (*op_func_node.pt_kernel_)(op_func_node.pt_kernel_context_); + op_with_kernel->WriteBackToOutputs(&runtime_context); + op_func_node.pt_kernel_context_->ClearData(); + } else { + op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); + op_func_node.kernel_func_(exec_ctx); + } // post-process grad_op.outputs if need cast complex grad into real grad. // NOTE(Aurelius84): insert a transfer_dtype_op inplacely to cast it. diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 8f27c7e1811fb..5f403613c6b30 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -32,8 +32,8 @@ #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/new_executor/new_executor_defs.h" -#include "paddle/fluid/framework/new_executor/workqueue.h" -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -60,13 +60,15 @@ class AsyncWorkQueue { // for execute host Kernel group_options.emplace_back(/*num_threads*/ host_num_threads, /*allow_spinning*/ true, - /*track_task*/ true, - /*queue_empty_waiter*/ waiter); + /*track_task*/ false, + /*detached*/ true, + /*events_waiter*/ waiter); // for launch device Kernel group_options.emplace_back(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true, - /*queue_empty_waiter*/ waiter); + /*track_task*/ false, + /*detached*/ true, + /*events_waiter*/ waiter); queue_group_ = CreateWorkQueueGroup(group_options); } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 73f16fe3e9cc7..4b9404fd178fd 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -673,6 +673,14 @@ 
OpKernelComputeFunc Instruction::KernelFunc() const { return op_func_node_.kernel_func_; } +pten::Kernel* Instruction::PtenKernel() const { + return op_func_node_.pt_kernel_; +} + +pten::KernelContext* Instruction::PtenKernelContext() const { + return op_func_node_.pt_kernel_context_; +} + OpFuncType Instruction::KernelType() const { return op_func_node_.type_; } OperatorBase* Instruction::OpBase() const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index d691a75a6d35b..ca49e7f5670d6 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -295,6 +295,11 @@ struct OpFuncNode { OpKernelComputeFunc kernel_func_; platform::DeviceContext* dev_ctx_; // not owned + + // fit for pten kernel + pten::Kernel* pt_kernel_{nullptr}; // not owned + pten::KernelContext* pt_kernel_context_{nullptr}; // not onwed + OpFuncType type_; }; @@ -313,6 +318,10 @@ class Instruction { OpKernelComputeFunc KernelFunc() const; + pten::Kernel* PtenKernel() const; + + pten::KernelContext* PtenKernelContext() const; + OpFuncType KernelType() const; OperatorBase* OpBase() const; diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 6876f219c92b9..b42f2da2a4d78 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -71,7 +71,6 @@ ProgramDesc load_from_file(const std::string& file_name) { fin.seekg(0, std::ios::beg); fin.read(&buffer[0], buffer.size()); fin.close(); - ProgramDesc program_desc(buffer); return program_desc; } diff --git a/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt new file mode 100644 index 0000000000000..77130102d52e5 --- /dev/null +++ b/paddle/fluid/framework/new_executor/workqueue/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc events_waiter.cc DEPS enforce glog) +cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue) diff --git a/paddle/fluid/framework/new_executor/event_count.h b/paddle/fluid/framework/new_executor/workqueue/event_count.h similarity index 98% rename from paddle/fluid/framework/new_executor/event_count.h rename to paddle/fluid/framework/new_executor/workqueue/event_count.h index 7f1e3670056fc..893c6d2d54ac7 100644 --- a/paddle/fluid/framework/new_executor/event_count.h +++ b/paddle/fluid/framework/new_executor/workqueue/event_count.h @@ -41,6 +41,10 @@ // and won't block, or notifying thread will see state_ change and will unblock // the waiter, or both. But it can't happen that both threads don't see each // other changes, which would lead to deadlock. +// +// What changed by PaddlePaddle +// 1. Allocate aligned storage for Waiters to get better performance. +// 2. Replace Eigen utils with std utils. #pragma once diff --git a/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc b/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc new file mode 100644 index 0000000000000..ac45e7b5fdfe9 --- /dev/null +++ b/paddle/fluid/framework/new_executor/workqueue/events_waiter.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +EventsWaiter::EventsWaiter() + : trigger_event_(nullptr), counter_(0), waiting_(false), cv_(1) {} + +std::shared_ptr EventsWaiter::RegisterEvent( + const std::string& name, EventChecker checker) { + auto counter = counter_.fetch_add(1); + auto id = std::hash()(name + std::to_string(counter)); + VLOG(10) << "Register event id:" << id << " name:" << name; + auto notifier = std::shared_ptr(new EventNotifier(id, this)); + EventInfo evt{id, name, TriggerType::LevelTriggered, std::move(checker)}; + std::lock_guard guard(events_lock_); + events_[id] = std::move(evt); + return notifier; +} + +std::shared_ptr EventsWaiter::RegisterEvent( + const std::string& name) { + auto counter = counter_.fetch_add(1); + auto id = std::hash()(name + std::to_string(counter)); + VLOG(10) << "Register event id:" << id << " name:" << name; + auto notifier = std::shared_ptr(new EventNotifier(id, this)); + EventInfo evt{id, name, TriggerType::EdgeTriggered, []() { return false; }}; + std::lock_guard guard(events_lock_); + events_[id] = std::move(evt); + return notifier; +} + +void EventsWaiter::UnregisterEvent(const EventId& id) { + VLOG(10) << "Unregister event id:" << id; + std::lock_guard guard(events_lock_); + events_.erase(id); +} + +std::string EventsWaiter::WaitEvent() { + // only one user can wait at any time + bool waiting = false; + if (!waiting_.compare_exchange_strong(waiting, true, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + PADDLE_THROW( + platform::errors::ResourceExhausted("Another thread is waiting.")); + } + auto w = cv_.GetWaiter(0); + cv_.Prewait(); + std::string* triggered = trigger_event_; + if (triggered == nullptr) { + // checkers + { + std::lock_guard guard(events_lock_); + for (auto& kv : events_) { + auto& evt = kv.second; + if (TriggerType::LevelTriggered == evt.type && evt.checker()) { + triggered = new std::string(evt.name); + break; + } + } + } + if (triggered != nullptr) { + std::string* prev = nullptr; + if (!trigger_event_.compare_exchange_strong(prev, triggered, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + delete triggered; + triggered = prev; + } + } + } + if (triggered) { + cv_.CancelWait(); + } else { + cv_.CommitWait(w); + triggered = trigger_event_; + } + trigger_event_.store(nullptr, std::memory_order_relaxed); + waiting_.store(false); + auto trigger_event = *triggered; + delete triggered; + return trigger_event; +} + +int EventsWaiter::Clear() { + bool waiting = false; + if (!waiting_.compare_exchange_strong(waiting, true, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + return -1; + } + trigger_event_.store(nullptr, std::memory_order_relaxed); + waiting_.store(false); + return 0; +} + +void EventsWaiter::TriggerEvent(const EventId& id) { + VLOG(10) << "Try to trigger event id:" << id; + std::string* trigger_event = new std::string; + { + std::lock_guard guard(events_lock_); + auto iter = events_.find(id); + if (iter == events_.end()) 
{ + delete trigger_event; + return; + } + *trigger_event = iter->second.name; + } + std::string* prev = nullptr; + if (!trigger_event_.compare_exchange_strong(prev, trigger_event, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + delete trigger_event; + return; + } + VLOG(10) << "Triggered event id:" << id << " name:" << *trigger_event; + cv_.Notify(true); +} + +std::string EventsWaiter::GetEventName(const EventId& id) { + std::lock_guard guard(events_lock_); + auto iter = events_.find(id); + if (iter == events_.end()) { + return "Unregistered"; + } + return iter->second.name; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/workqueue/events_waiter.h b/paddle/fluid/framework/new_executor/workqueue/events_waiter.h new file mode 100644 index 0000000000000..5ffed15155d59 --- /dev/null +++ b/paddle/fluid/framework/new_executor/workqueue/events_waiter.h @@ -0,0 +1,111 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/new_executor/workqueue/event_count.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" + +namespace paddle { +namespace framework { + +// A multiplexing waiter, be able to wait multiple kinds of events +// simultaneously. +// Muti-Producer single-consumer single-slot message-queue. +class EventsWaiter { + public: + using EventId = std::size_t; + + using EventChecker = std::function; + + // Make sure EventsWaiter has a longer lifetime than EventNotifier. + class EventNotifier { + public: + void NotifyEvent() { waiter_.TriggerEvent(id_); } + + void UnregisterEvent() { waiter_.UnregisterEvent(id_); } + + EventId GetEventId() { return id_; } + + // return "Unregistered" if the corresponding event was unregistered. + std::string GetEventName() { return waiter_.GetEventName(id_); } + + private: + friend EventsWaiter; + EventNotifier(EventId id, EventsWaiter* waiter) + : id_(id), waiter_(*waiter) {} + EventNotifier(const EventNotifier&) = delete; + void operator=(const EventNotifier&) = delete; + + EventId id_; + EventsWaiter& waiter_; + }; + + EventsWaiter(); + EventsWaiter(const EventsWaiter&) = delete; + EventsWaiter& operator=(const EventsWaiter&) = delete; + + // Register a level-triggered event. If the checker returns true or + // EventNotifier::NotifyEvent is called, the corresponding event will be + // distributed. + std::shared_ptr RegisterEvent(const std::string& name, + EventChecker checker); + + // Register an edge-triggered event. The corresponding event will be + // distributed when EventNotifier::NotifyEvent is called. + std::shared_ptr RegisterEvent(const std::string& name); + + void UnregisterEvent(const EventId& id); + + // Blocking the calling thread to wait any of the registered events. + std::string WaitEvent(); + + // Nonblocking. + // Clear the slot, no matter whether there is an event. 
+ // Return value: + // -1 : another thread is waiting. + // 0 : succ. + int Clear(); + + private: + friend EventNotifier; + + enum class TriggerType { LevelTriggered, EdgeTriggered }; + + struct EventInfo { + EventId id; + std::string name; + TriggerType type; + EventChecker checker; + }; + + void TriggerEvent(const EventId& id); + + std::string GetEventName(const EventId& id); + + std::unordered_map events_; + paddle::memory::SpinLock events_lock_; + std::atomic trigger_event_; + std::atomic counter_; + std::atomic waiting_; + EventCount cv_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h similarity index 94% rename from paddle/fluid/framework/new_executor/nonblocking_threadpool.h rename to paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index cdcdbbb445185..37044d3c19b35 100644 --- a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -12,43 +12,14 @@ #include #include #include -#include "paddle/fluid/framework/new_executor/event_count.h" -#include "paddle/fluid/framework/new_executor/run_queue.h" -#include "paddle/fluid/framework/new_executor/thread_environment.h" +#include "paddle/fluid/framework/new_executor/workqueue/event_count.h" +#include "paddle/fluid/framework/new_executor/workqueue/run_queue.h" +#include "paddle/fluid/framework/new_executor/workqueue/thread_environment.h" +#include "paddle/fluid/platform/os_info.h" namespace paddle { namespace framework { -template -class TaskTracker { - public: - TaskTracker() = default; - - explicit TaskTracker(Notifier& notifier) : notifier_(¬ifier) {} - - TaskTracker(const TaskTracker&) = delete; - - TaskTracker& operator=(const TaskTracker&) = delete; - - ~TaskTracker() = default; - - void AddCounter() { num_tasks_.fetch_add(1, std::memory_order_relaxed); } - - void SubCounter() { - if (1 == num_tasks_.fetch_sub(1, std::memory_order_relaxed)) { - if (notifier_ != nullptr) { - notifier_->NotifyEvent(); - } - } - } - - uint64_t PendingTaskNum() { return num_tasks_.load(); } - - private: - alignas(64) std::atomic num_tasks_{0}; - Notifier* notifier_{nullptr}; -}; - template class ThreadPoolTempl { public: diff --git a/paddle/fluid/framework/new_executor/run_queue.h b/paddle/fluid/framework/new_executor/workqueue/run_queue.h similarity index 97% rename from paddle/fluid/framework/new_executor/run_queue.h rename to paddle/fluid/framework/new_executor/workqueue/run_queue.h index e457b20a3c35d..2fc42cf308ab8 100644 --- a/paddle/fluid/framework/new_executor/run_queue.h +++ b/paddle/fluid/framework/new_executor/workqueue/run_queue.h @@ -29,6 +29,11 @@ // separate state variable as null/non-null pointer value would serve as state, // but that would require malloc/free per operation for large, complex values // (and this is designed to store std::function<()>). +// +// What changed by PaddlePaddle +// 1. Use paddle::memory::SpinLock instead of std::mutex to protect back_. +// 2. Make front_/back_ aligned to get better performance. +// 3. Replace Eigen utils with std utils. 
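Editor's note: the EventsWaiter declared above multiplexes level-triggered events (driven by a checker that WaitEvent() re-evaluates) and edge-triggered events (driven by NotifyEvent()) into one blocking WaitEvent() call. The following standalone sketch is illustrative only and is not part of this patch; the demo function, counter, and event names are hypothetical, and only the interfaces declared in events_waiter.h above are assumed.

#include <atomic>
#include <string>
#include <thread>
#include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h"

void EventsWaiterDemo() {
  using paddle::framework::EventsWaiter;
  std::atomic<int> pending{3};
  EventsWaiter waiter;

  // Level-triggered: the checker is re-evaluated inside WaitEvent().
  auto done = waiter.RegisterEvent(
      "AllDone", [&pending]() { return pending.load() == 0; });

  // Edge-triggered: delivered only when NotifyEvent() is called.
  auto cancel = waiter.RegisterEvent("Cancel");

  std::thread worker([&pending, cancel]() {
    for (int i = 0; i < 3; ++i) {
      pending.fetch_sub(1);  // pretend to finish a task
    }
    cancel->NotifyEvent();   // either event can wake the single waiter
  });

  // Only one thread may block in WaitEvent() at a time; it returns the name
  // of whichever registered event fired first, e.g. "AllDone" or "Cancel".
  std::string fired = waiter.WaitEvent();
  worker.join();
  (void)done;
  (void)fired;
}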
#pragma once @@ -37,7 +42,7 @@ #include #include #include -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/memory/allocation/spin_lock.h" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/thread_environment.h b/paddle/fluid/framework/new_executor/workqueue/thread_environment.h similarity index 100% rename from paddle/fluid/framework/new_executor/thread_environment.h rename to paddle/fluid/framework/new_executor/workqueue/thread_environment.h diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc similarity index 77% rename from paddle/fluid/framework/new_executor/workqueue.cc rename to paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 7607b3a297f84..3f06f3db23118 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -4,9 +4,9 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#include "paddle/fluid/framework/new_executor/workqueue.h" -#include "paddle/fluid/framework/new_executor/nonblocking_threadpool.h" -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" +#include "paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -18,24 +18,35 @@ using TaskTracker = TaskTracker; class WorkQueueImpl : public WorkQueue { public: explicit WorkQueueImpl(const WorkQueueOptions& options) : WorkQueue(options) { - if (options_.track_task && options.queue_empty_waiter != nullptr) { + if (options_.track_task && options.events_waiter != nullptr) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); TaskTracker* tracker = reinterpret_cast(storage); - auto notifier = options.queue_empty_waiter->RegisterEvent( + empty_notifier_ = options.events_waiter->RegisterEvent( kQueueEmptyEvent, [tracker]() { return tracker->PendingTaskNum() == 0; }); - tracker_ = new (storage) TaskTracker(*notifier.get()); + tracker_ = new (storage) TaskTracker(*empty_notifier_.get()); + } + if (options_.detached == false && options.events_waiter != nullptr) { + destruct_notifier_ = + options.events_waiter->RegisterEvent(kQueueDestructEvent); } queue_ = new NonblockingThreadPool(options_.num_threads, options_.allow_spinning); } virtual ~WorkQueueImpl() { + if (empty_notifier_) { + empty_notifier_->UnregisterEvent(); + } + delete queue_; if (tracker_ != nullptr) { tracker_->~TaskTracker(); AlignedFree(tracker_); } - delete queue_; + if (destruct_notifier_) { + destruct_notifier_->NotifyEvent(); + destruct_notifier_->UnregisterEvent(); + } } void AddTask(std::function fn) override { @@ -59,6 +70,8 @@ class WorkQueueImpl : public WorkQueue { private: NonblockingThreadPool* queue_{nullptr}; TaskTracker* tracker_{nullptr}; + std::shared_ptr empty_notifier_; + std::shared_ptr destruct_notifier_; }; class WorkQueueGroupImpl : public WorkQueueGroup { @@ -80,6 +93,8 @@ class WorkQueueGroupImpl : public WorkQueueGroup { std::vector queues_; NonblockingThreadPool* queues_storage_; TaskTracker* tracker_; + std::shared_ptr empty_notifier_; + std::shared_ptr destruct_notifier_; }; 
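Editor's note: both WorkQueueImpl and WorkQueueGroupImpl above construct their TaskTracker in manually aligned storage (AlignedMalloc plus placement new) and tear it down with an explicit destructor call followed by AlignedFree, because the tracker keeps an alignas(64) atomic counter to avoid false sharing. A minimal sketch of that idiom follows; DemoTracker and the helper functions are hypothetical stand-ins, while AlignedMalloc/AlignedFree are the utilities declared in workqueue_utils.h in this patch.

#include <atomic>
#include <cstdint>
#include <new>
#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"

// Hypothetical stand-in for TaskTracker: over-aligned to a cache line.
struct alignas(64) DemoTracker {
  std::atomic<uint64_t> num_tasks{0};
};

DemoTracker* CreateDemoTracker() {
  // Raw aligned storage + placement new, mirroring WorkQueueImpl above.
  void* storage = paddle::framework::AlignedMalloc(sizeof(DemoTracker),
                                                   alignof(DemoTracker));
  return new (storage) DemoTracker();
}

void DestroyDemoTracker(DemoTracker* tracker) {
  tracker->~DemoTracker();                  // explicit destructor call
  paddle::framework::AlignedFree(tracker);  // must pair with AlignedMalloc
}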
WorkQueueGroupImpl::WorkQueueGroupImpl( @@ -94,13 +109,17 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( for (size_t idx = 0; idx < num_queues; ++idx) { const auto& options = queues_options_[idx]; if (options.track_task && tracker_ == nullptr && - options.queue_empty_waiter != nullptr) { + options.events_waiter != nullptr) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); TaskTracker* tracker = reinterpret_cast(storage); - auto notifier = options.queue_empty_waiter->RegisterEvent( + empty_notifier_ = options.events_waiter->RegisterEvent( kQueueEmptyEvent, [tracker]() { return tracker->PendingTaskNum() == 0; }); - tracker_ = new (storage) TaskTracker(*notifier.get()); + tracker_ = new (storage) TaskTracker(*empty_notifier_.get()); + } + if (options.detached == false && options.events_waiter != nullptr) { + destruct_notifier_ = + options.events_waiter->RegisterEvent(kQueueDestructEvent); } queues_[idx] = new (&queues_storage_[idx]) NonblockingThreadPool(options.num_threads, options.allow_spinning); @@ -108,6 +127,9 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( } WorkQueueGroupImpl::~WorkQueueGroupImpl() { + if (empty_notifier_) { + empty_notifier_->UnregisterEvent(); + } for (auto queue : queues_) { queue->~NonblockingThreadPool(); } @@ -116,6 +138,10 @@ WorkQueueGroupImpl::~WorkQueueGroupImpl() { AlignedFree(tracker_); } free(queues_storage_); + if (destruct_notifier_) { + destruct_notifier_->NotifyEvent(); + destruct_notifier_->UnregisterEvent(); + } } void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { diff --git a/paddle/fluid/framework/new_executor/workqueue.h b/paddle/fluid/framework/new_executor/workqueue/workqueue.h similarity index 87% rename from paddle/fluid/framework/new_executor/workqueue.h rename to paddle/fluid/framework/new_executor/workqueue/workqueue.h index a299d0aaed7d2..068c54a21a452 100644 --- a/paddle/fluid/framework/new_executor/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.h @@ -22,6 +22,7 @@ namespace paddle { namespace framework { constexpr const char* kQueueEmptyEvent = "QueueEmpty"; +constexpr const char* kQueueDestructEvent = "QueueDestruct"; class EventsWaiter; @@ -32,20 +33,24 @@ struct WorkQueueOptions { track_task(track_task) {} WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task, - EventsWaiter* waiter) + bool detached, EventsWaiter* waiter) : num_threads(num_threads), allow_spinning(allow_spinning), track_task(track_task), - queue_empty_waiter(waiter) {} + detached(detached), + events_waiter(waiter) {} size_t num_threads; bool allow_spinning; // If you need to blocking the calling thread to wait "queue empty", set - // track_task = true and set queue_empty_waiter. EventsWaiter::WaitEvent will + // track_task = true and set events_waiter. EventsWaiter::WaitEvent will // block the calling thread until any of events (including "queue empty") // occured. bool track_task; - EventsWaiter* queue_empty_waiter{nullptr}; // not owned + // If you need to be noticed when a WorkQueue Destruct() , set detached = + // false and set events_waiter. 
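Editor's note: with the extended WorkQueueOptions, a caller chooses both whether queue-empty tracking is wired to an EventsWaiter (track_task) and whether queue destruction is reported (detached = false). The sketch below is illustrative, not part of the patch; it mirrors the updated workqueue_test.cc, and the thread counts and task body are arbitrary.

#include <string>
#include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h"
#include "paddle/fluid/framework/new_executor/workqueue/workqueue.h"

void WorkQueueOptionsDemo() {
  using namespace paddle::framework;  // NOLINT
  EventsWaiter waiter;  // must outlive the queues that reference it

  // Tracked and attached: "QueueEmpty" is reported through the checker,
  // and "QueueDestruct" is reported when the queue is destroyed.
  WorkQueueOptions tracked(/*num_threads*/ 2, /*allow_spinning*/ true,
                           /*track_task*/ true, /*detached*/ false, &waiter);

  // Fully detached (the original three-argument form): nothing is ever
  // reported to an EventsWaiter.
  WorkQueueOptions detached(/*num_threads*/ 1, /*allow_spinning*/ true,
                            /*track_task*/ false);

  auto queue = CreateMultiThreadedWorkQueue(tracked);
  queue->AddTask([]() { /* do some work */ });
  std::string evt = waiter.WaitEvent();  // blocks until drained: "QueueEmpty"
  queue.reset();
  evt = waiter.WaitEvent();              // detached == false: "QueueDestruct"
  (void)detached;
  (void)evt;
}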
+ bool detached{true}; + EventsWaiter* events_waiter{nullptr}; // not owned }; class WorkQueue { diff --git a/paddle/fluid/framework/new_executor/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc similarity index 73% rename from paddle/fluid/framework/new_executor/workqueue_test.cc rename to paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc index 3ea0096b631e8..e06beb623be4c 100644 --- a/paddle/fluid/framework/new_executor/workqueue_test.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_test.cc @@ -12,11 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/new_executor/workqueue.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue.h" #include #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" + +TEST(WorkQueueUtils, TestEventsWaiter) { + using paddle::framework::EventsWaiter; + EventsWaiter events_waiter; + auto notifier = + events_waiter.RegisterEvent("test_register_lt", []() { return true; }); + EXPECT_EQ(events_waiter.WaitEvent(), "test_register_lt"); + EXPECT_EQ(notifier->GetEventName(), "test_register_lt"); + EXPECT_EQ(events_waiter.WaitEvent(), "test_register_lt"); + notifier->UnregisterEvent(); + EXPECT_EQ(notifier->GetEventName(), "Unregistered"); + notifier = events_waiter.RegisterEvent("test_register_et"); + notifier->NotifyEvent(); + EXPECT_EQ(events_waiter.WaitEvent(), "test_register_et"); +} TEST(WorkQueue, TestSingleThreadedWorkQueue) { VLOG(1) << "In Test"; @@ -30,7 +45,8 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) { // CreateSingleThreadedWorkQueue EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true, &events_waiter); + /*track_task*/ true, /*detached*/ true, + &events_waiter); auto work_queue = CreateSingleThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 1u); @@ -63,7 +79,8 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { // CreateMultiThreadedWorkQueue EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true, &events_waiter); + /*track_task*/ true, /*detached*/ false, + &events_waiter); auto work_queue = CreateMultiThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 10u); @@ -80,11 +97,13 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { } // WaitQueueEmpty EXPECT_EQ(finished.load(), false); - events_waiter.WaitEvent(); + EXPECT_EQ(events_waiter.WaitEvent(), paddle::framework::kQueueEmptyEvent); EXPECT_EQ(finished.load(), true); EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum); // Cancel work_queue->Cancel(); + work_queue.reset(); + EXPECT_EQ(events_waiter.WaitEvent(), paddle::framework::kQueueDestructEvent); } TEST(WorkQueue, TestWorkQueueGroup) { @@ -99,9 +118,11 @@ TEST(WorkQueue, TestWorkQueueGroup) { // ThreadedWorkQueueGroup EventsWaiter events_waiter; WorkQueueOptions sq_options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true, &events_waiter); + /*track_task*/ true, /*detached*/ false, + &events_waiter); WorkQueueOptions mq_options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true, &events_waiter); + /*track_task*/ true, /*detached*/ false, + &events_waiter); auto queue_group = CreateWorkQueueGroup({sq_options, mq_options}); // 
NumThreads EXPECT_EQ(queue_group->QueueNumThreads(0), 1u); @@ -126,4 +147,7 @@ TEST(WorkQueue, TestWorkQueueGroup) { EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum + kLoopNum); // Cancel queue_group->Cancel(); + events_waiter.WaitEvent(); + queue_group.reset(); + EXPECT_EQ(events_waiter.WaitEvent(), paddle::framework::kQueueDestructEvent); } diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc similarity index 50% rename from paddle/fluid/framework/new_executor/workqueue_utils.cc rename to paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc index 2c81cffb49d82..82dcbbd509dd5 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h" #include #include @@ -55,62 +55,5 @@ void AlignedFree(void* mem_ptr) { #endif } -constexpr EventsWaiter::EventId kEmptyEventId = -1; - -EventsWaiter::EventsWaiter() - : trigger_event_(kEmptyEventId), waiting_(false), cv_(1) {} - -std::shared_ptr EventsWaiter::RegisterEvent( - const std::string& name, EventChecker checker) { - names_.emplace_back(name); - checkers_.emplace_back(std::move(checker)); - EventId id = checkers_.size() - 1; - auto notifier = std::shared_ptr(new EventNotifier(id, this)); - notifiers_.emplace_back(notifier); - return notifier; -} - -std::string EventsWaiter::WaitEvent() { - // only one user can wait at any time - bool waiting = false; - if (!waiting_.compare_exchange_strong(waiting, true, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { - PADDLE_THROW( - platform::errors::ResourceExhausted("Another thread is waiting.")); - } - EventId id = kEmptyEventId; - auto w = cv_.GetWaiter(0); - cv_.Prewait(); - int64_t event_num = checkers_.size(); - for (int64_t i = 0; id == kEmptyEventId && i < event_num; ++i) { - if (checkers_[i]()) { - id = i; - } - } - if (id != kEmptyEventId) { - cv_.CancelWait(); - } else { - cv_.CommitWait(w); - id = trigger_event_.load(std::memory_order_relaxed); - } - trigger_event_.store(kEmptyEventId, std::memory_order_relaxed); - waiting_.store(false); - return names_.at(id); -} - -void EventsWaiter::SetTriggerEvent(const EventId& id) { - trigger_event_.store(id, std::memory_order_relaxed); - cv_.Notify(true); -} - -std::string EventsWaiter::EventNotifier::GetEventName() { - return waiter_.names_.at(id_); -} - -void EventsWaiter::EventNotifier::NotifyEvent() { - waiter_.SetTriggerEvent(id_); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h similarity index 59% rename from paddle/fluid/framework/new_executor/workqueue_utils.h rename to paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h index a06d9f319dfee..eee64df285dcb 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h @@ -21,8 +21,7 @@ #include #include #include -#include -#include "paddle/fluid/framework/new_executor/event_count.h" +#include "paddle/fluid/framework/new_executor/workqueue/events_waiter.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -69,55 +68,34 @@ 
void* AlignedMalloc(size_t size, size_t alignment); void AlignedFree(void* memory_ptr); -// A multiplexing waiter, be able to wait multi events simultaneously. -// Blocking the calling thread to wait any of the registered events. -// Non-thread-safe. -class EventsWaiter { +template +class TaskTracker { public: - using EventId = int64_t; + TaskTracker() = default; - using EventChecker = std::function; + explicit TaskTracker(Notifier& notifier) : notifier_(¬ifier) {} - class EventNotifier { - public: - void NotifyEvent(); + TaskTracker(const TaskTracker&) = delete; - EventId GetEventId() { return id_; } + TaskTracker& operator=(const TaskTracker&) = delete; - std::string GetEventName(); + ~TaskTracker() = default; - private: - friend EventsWaiter; - EventNotifier(EventId id, EventsWaiter* waiter) - : id_(id), waiter_(*waiter) {} + void AddCounter() { num_tasks_.fetch_add(1, std::memory_order_relaxed); } - EventId id_; - EventsWaiter& waiter_; - }; - - EventsWaiter(); - - EventsWaiter(const EventsWaiter&) = delete; - - EventsWaiter& operator=(const EventsWaiter&) = delete; - - // All the RegisterEvent functions must be called before any WaitEvent - std::shared_ptr RegisterEvent(const std::string& name, - EventChecker checker); + void SubCounter() { + if (1 == num_tasks_.fetch_sub(1, std::memory_order_relaxed)) { + if (notifier_ != nullptr) { + notifier_->NotifyEvent(); + } + } + } - // Wait any of the registered events - std::string WaitEvent(); + uint64_t PendingTaskNum() { return num_tasks_.load(); } private: - friend EventNotifier; - void SetTriggerEvent(const EventId& id); - - std::vector names_; - std::vector checkers_; - std::vector> notifiers_; - std::atomic trigger_event_; - std::atomic waiting_; - EventCount cv_; + alignas(64) std::atomic num_tasks_{0}; + Notifier* notifier_{nullptr}; }; } // namespace framework diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index f658c33b8bea1..e09daf6df646e 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -352,15 +352,9 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) { inputs_ = op_desc.inputs_; outputs_ = op_desc.outputs_; attrs_ = op_desc.attrs_; + // The record of original_id_ is only for auto parallel. + original_id_ = op_desc.original_id_; need_update_ = true; - // When creating graph from program, the creation of op node will create a new - // OpDesc instead of - // referring to the original one. To find the original OpDesc of the op node, - // the id have to be - // copied to the new OpDesc. The var node has the same situation, but the - // default copy constructor - // can copy the id automatically. - id_ = op_desc.id_; } OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 3e127e0b54a8a..c077073252499 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -156,17 +156,10 @@ class OpDesc { const BlockDesc *Block() const { return this->block_; } - // This thread-safe implementation seems to be redudent since the neural - // networks - // are usually constructed in a single thread - static uint64_t GenerateId() { - static std::atomic id{0}; - return ++id; - } - - // Note: the identity only used as a key for referring to its - // distributed attribute now. + // The Id() and OrignalId() are only used for auto parallel. 
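Editor's note: the id_/original_id_ split below gives auto parallel a stable way to map a derived OpDesc back to the one it was built from: id_ is always freshly generated, while original_id_ defaults to id_ and is carried over by CopyFrom (or set explicitly via SetOriginalId). A schematic illustration, not part of the patch; the op type and construction here are hypothetical.

#include "paddle/fluid/framework/op_desc.h"

void OriginalIdDemo() {
  using paddle::framework::OpDesc;
  OpDesc src;            // fresh id_; original_id_ defaults to that id_
  src.SetType("matmul_v2");

  OpDesc dst;            // gets its own fresh id_
  dst.CopyFrom(src);

  // After CopyFrom:
  //   dst.Id()         != src.Id()          (CopyFrom no longer copies id_)
  //   dst.OriginalId() == src.OriginalId()  (== src.Id() here), so auto
  //   parallel can still refer back to the OpDesc that dst was built from.
  // SetOriginalId() lets a pass adjust that lineage explicitly.
}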
uint64_t Id() const { return id_; } + uint64_t OriginalId() const { return original_id_; } + void SetOriginalId(uint64_t original_id) { original_id_ = original_id; } private: template @@ -179,6 +172,14 @@ class OpDesc { return ret_val; } + // This thread-safe implementation seems to be redudent since the neural + // networks are usually constructed in a single thread + static uint64_t GenerateId() { + static std::atomic uid{0}; + // Must start from one + return ++uid; + } + proto::OpDesc desc_; BlockDesc *block_{nullptr}; // not_own // input arg name => input variable names @@ -191,7 +192,13 @@ class OpDesc { // local changes should be synchronized, need_update_ should be set to true. bool need_update_{false}; + // Note: the id_ is unique (only for auto parallel). uint64_t id_ = GenerateId(); + // Note: the orignal_id_ is used for referring to the original OpDesc + // that the current OpDesc is built from (only for auto parallel). + // The default original_id_ is same as the id_, which means the + // current OpDesc is not built from the other one. + uint64_t original_id_ = id_; }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 51734ba5f0d6a..18d22fbfaab38 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -56,6 +56,7 @@ DECLARE_bool(enable_unused_var_check); PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, "number of threads for inner op"); DECLARE_bool(run_pten_kernel); +DECLARE_bool(run_kp_kernel); namespace paddle { namespace framework { @@ -1089,7 +1090,7 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const { RuntimeInferShapeContext infer_shape_ctx(*this, ctx); - this->InferShape(&infer_shape_ctx); + this->Info().infer_shape_(&infer_shape_ctx); } void OperatorWithKernel::RunImpl(const Scope& scope, @@ -1177,6 +1178,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("infer_shape", platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); + // TODO(chenweihang): replace this after removing `this->IsMKLDNNType()` + // in some mkldnn infershape functions, such conv2d infershape this->InferShape(&infer_shape_ctx); } @@ -1800,6 +1803,9 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( void OperatorWithKernel::BuildPtenKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx) const { + if (pt_kernel_context_ == nullptr) { + pt_kernel_context_.reset(new pten::KernelContext()); + } // TODO(chenweihang): now only work for very simple case, // many cases need to be deal with later: // 1. 
the input and output are not tensor diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 842ef0457d7bd..59bc4813d985b 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -555,6 +555,20 @@ class OperatorWithKernel : public OperatorBase { virtual KernelSignature GetExpectedPtenKernelArgs( const ExecutionContext& ctx) const; + /* member functions for adapting to pten lib */ + void ChoosePtenKernel(const ExecutionContext& ctx) const; + + void BuildPtenKernelContext(const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx) const; + + void WriteBackToOutputs(RuntimeContext* ctx) const; + + pten::Kernel* PtenKernel() const { return pt_kernel_.get(); } + + pten::KernelContext* PtenKernelContext() const { + return pt_kernel_context_.get(); + } + private: void RunImpl(const Scope& scope, const platform::Place& place) const final; void RunImpl(const Scope& scope, const platform::Place& place, @@ -595,14 +609,6 @@ class OperatorWithKernel : public OperatorBase { Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx, const std::string& name) const; - /* member functions for adapting to pten lib */ - void ChoosePtenKernel(const ExecutionContext& ctx) const; - - void BuildPtenKernelContext(const RuntimeContext& ctx, - platform::DeviceContext* dev_ctx) const; - - void WriteBackToOutputs(RuntimeContext* ctx) const; - protected: mutable std::unique_ptr kernel_type_; mutable std::unique_ptr kernel_func_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ca86cda52fb61..9a38a2d5d6fe8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -788,7 +788,7 @@ void ParallelExecutor::BCastParamsToDevices( void *buffer; if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); + buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); @@ -831,7 +831,7 @@ void ParallelExecutor::BCastParamsToDevices( void *buffer; if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); + buffer = const_cast(main_tensor.data()); } else { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 4a31adcca65ec..60b93f4a71664 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -101,20 +101,25 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE_EQ(desc_.ParseFromString(binary_str), true, platform::errors::InvalidArgument( "Failed to parse program_desc from binary string.")); + VLOG(1) << 3333; InitFromProto(); } void ProgramDesc::InitFromProto() { + VLOG(1) << 4444; for (auto &block_desc : *desc_.mutable_blocks()) { blocks_.emplace_back(new BlockDesc(this, &block_desc)); } + VLOG(1) << 5555; for (auto &block : blocks_) { for (auto *op : block->AllOps()) { for (const auto &attr : op->Proto()->attrs()) { if (attr.type() == proto::AttrType::BLOCK) { + VLOG(1) << 6666; size_t blk_idx = attr.block_idx(); op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); } else if (attr.type() == proto::AttrType::BLOCKS) { + VLOG(1) << 7777; auto blks_idx = attr.blocks_idx(); std::vector block_descs; for (int blk_idx : blks_idx) { diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 
9521df651f9de..b8aedcce3e3fa 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -60,7 +60,7 @@ OpKernelType TransPtenKernelKeyToOpKernelType( proto::VarType::Type data_type = pten::TransToProtoVarType(kernel_key.dtype()); platform::Place place = pten::TransToFluidPlace(kernel_key.backend()); - DataLayout data_layout = pten::TransToFluidDataLayout(kernel_key.layout()); + DataLayout data_layout = kernel_key.layout(); LibraryType library_type = LibraryType::kPlain; if (kernel_key.backend() == pten::Backend::MKLDNN) { library_type = LibraryType::kMKLDNN; @@ -83,8 +83,7 @@ pten::KernelKey TransOpKernelTypeToPtenKernelKey( } else { // do } - paddle::experimental::DataLayout layout = - pten::TransToPtenDataLayout(kernel_type.data_layout_); + paddle::experimental::DataLayout layout = kernel_type.data_layout_; paddle::experimental::DataType dtype = pten::TransToPtenDataType(kernel_type.data_type_); return pten::KernelKey(backend, layout, dtype); @@ -99,7 +98,8 @@ KernelSignatureMap& KernelSignatureMap::Instance() { for (const auto& pair : OpInfoMap::Instance().map()) { const auto& op_type = pair.first; const auto* op_proto = pair.second.proto_; - if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + if (pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && + op_proto != nullptr) { KernelArgsNameMakerByOpProto maker(op_proto); VLOG(10) << "Register kernel signature for " << op_type; auto success = kernel_signature_map_->map_ diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc index 1731a974b71d8..0f1a8e2a9ed5f 100644 --- a/paddle/fluid/framework/save_load_util.cc +++ b/paddle/fluid/framework/save_load_util.cc @@ -295,12 +295,12 @@ bool SaveTensorToDisk(const std::string& file_name, // save tensor uint64_t data_size = tensor->numel() * framework::SizeOfType(tensor->type()); - auto* data_ptr = tensor->data(); + auto* data_ptr = tensor->data(); if (platform::is_gpu_place(tensor->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); - data_ptr = temp.data(); + data_ptr = temp.data(); #else PADDLE_THROW(platform::errors::Unavailable( "Tensor is in CUDA device, but paddle not compiled with CUDA.")); diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index cbbc020989d1e..e5dfe28be7a3c 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/tensor.h" +#include "paddle/pten/api/lib/utils/storage.h" DECLARE_bool(use_stream_safe_cuda_allocator); @@ -26,152 +27,55 @@ class Allocation; namespace paddle { namespace framework { -extern size_t SizeOfType(proto::VarType::Type type); -void Tensor::check_memory_size() const { - PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet( - "Tensor holds no memory. " - "Call Tensor::mutable_data firstly.")); - size_t size = numel() * SizeOfType(type()); - - PADDLE_ENFORCE_LE( - size, memory_size(), - platform::errors::PreconditionNotMet( - "Tensor's dimension is out of bound." - "Tensor's dimension must be equal or less than the size of its " - "memory." 
- "But received Tensor's dimension is d%, memory's size is %d.", - size, memory_size())); -} - -Tensor::Tensor(const proto::VarType::Type& dtype) - : type_(dtype), - offset_(0), - inplace_version_counter_(std::make_shared(0)) {} - -size_t Tensor::memory_size() const { - return holder_ == nullptr ? 0UL : holder_->size() - offset_; -} - -void* Tensor::mutable_data(const platform::Place& place, - proto::VarType::Type type, size_t requested_size) { - type_ = type; - PADDLE_ENFORCE_GE( - numel(), 0, - platform::errors::PreconditionNotMet( - "The Tensor's element number must be equal or greater than zero. " - "The Tensor's shape is [", - dims(), "] now")); - size_t size = numel() * SizeOfType(type); - if (requested_size && (requested_size > size)) { - size = requested_size; - } - /* some versions of boost::variant don't have operator!= */ - if (holder_ == nullptr || !(holder_->place() == place) || - holder_->size() < size + offset_) { - // Reset holder first before re-allocate to save memory - holder_.reset(); - holder_ = memory::AllocShared(place, size); - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} - -void* Tensor::mutable_data(const platform::Place& place, - size_t requested_size) { - PADDLE_ENFORCE_NOT_NULL(this->holder_, platform::errors::PreconditionNotMet( - "The tensor is not initialized.")); - return mutable_data(place, type_, requested_size); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -void* Tensor::mutable_data(const platform::CUDAPlace& place, - proto::VarType::Type type, - const gpuStream_t& stream) { - if (!FLAGS_use_stream_safe_cuda_allocator) { - return mutable_data(place, type); - } - - type_ = type; - PADDLE_ENFORCE_GE( - numel(), 0, - platform::errors::PreconditionNotMet( - "The Tensor's element number must be equal or greater than zero. " - "The Tensor's shape is [", - dims(), "] now")); - size_t size = numel() * SizeOfType(type); - - /* some versions of boost::variant don't have operator!= */ - if (holder_ == nullptr || !(holder_->place() == place) || - holder_->size() < size + offset_) { - holder_.reset(); - holder_ = memory::AllocShared(place, size, stream); - offset_ = 0; - } - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} -#endif - -Tensor& Tensor::ShareDataWith(const Tensor& src) { - src.check_memory_size(); - *this = src; - return *this; -} -Tensor& Tensor::ShareInplaceVersionCounterWith(const Tensor& src) { - PADDLE_ENFORCE_NOT_NULL( - inplace_version_counter_, - platform::errors::PreconditionNotMet( - "Tensor does not hold inplace_version_counter_.")); - - inplace_version_counter_ = src.inplace_version_counter_; - return *this; -} Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { check_memory_size(); - PADDLE_ENFORCE_GE( - begin_idx, 0, - platform::errors::OutOfRange("The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE( - end_idx, dims_[0], - platform::errors::OutOfRange("The end row index is out of bound.")); + PADDLE_ENFORCE_GE(begin_idx, 0, + paddle::platform::errors::OutOfRange( + "The start row index must be greater than 0." + "But received the start index is d%.", + begin_idx)); + PADDLE_ENFORCE_LE(end_idx, meta_.dims[0], + paddle::platform::errors::OutOfRange( + "The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - platform::errors::InvalidArgument( + paddle::platform::errors::InvalidArgument( "The start row index must be less than the end row index." 
"But received the start index = %d, the end index = %d.", begin_idx, end_idx)); - if (dims_[0] == 1) { + if (meta_.dims[0] == 1) { return *this; } else { - size_t base = numel() / dims_[0]; + size_t base = numel() / meta_.dims[0]; Tensor dst; - dst.holder_ = holder_; - dst.set_layout(layout_); - dst.type_ = type_; - DDim dst_dims = dims_; + dst.storage_ = pten::make_intrusive( + storage_->data_shared()); + dst.meta_.layout = meta_.layout; + dst.meta_.dtype = meta_.dtype; + DDim dst_dims = meta_.dims; dst_dims[0] = end_idx - begin_idx; dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); + dst.meta_.offset = meta_.offset + begin_idx * base * SizeOf(dtype()); return dst; } } std::vector Tensor::Split(int64_t split_size, int64_t axis) const { check_memory_size(); - PADDLE_ENFORCE_GE(dims_.size(), 0, - platform::errors::OutOfRange( + + PADDLE_ENFORCE_GE(meta_.dims.size(), 0, + paddle::platform::errors::OutOfRange( "split expects at least a 1-dimensional tensor")); + PADDLE_ENFORCE_GE( split_size, 0, - platform::errors::OutOfRange( + paddle::platform::errors::OutOfRange( "split expects split_size be non-negative, but got split_size is %d", split_size)); - int64_t numel_size = dims_[axis]; + + int64_t numel_size = meta_.dims[axis]; int64_t num_splits = 1; if (split_size != 0) { @@ -191,49 +95,33 @@ std::vector Tensor::Split(int64_t split_size, int64_t axis) const { std::vector Tensor::Chunk(int64_t chunks, int64_t axis) const { check_memory_size(); - PADDLE_ENFORCE_GE(dims_.size(), 0, - platform::errors::OutOfRange( + PADDLE_ENFORCE_GE(meta_.dims.size(), 0, + paddle::platform::errors::OutOfRange( "split expects at least a 1-dimensional tensor")); PADDLE_ENFORCE_GE( chunks, 0, - platform::errors::OutOfRange( + paddle::platform::errors::OutOfRange( "chunks expects to be greater than 0, but got chunks is %d", chunks)); - int64_t numel_size = dims_[axis]; + int64_t numel_size = meta_.dims[axis]; int64_t split_size = (numel_size + chunks - 1) / chunks; return Split(split_size, axis); } -Tensor& Tensor::Resize(const DDim& dims) { - dims_ = dims; +Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; return *this; } +Tensor& Tensor::ShareInplaceVersionCounterWith(const Tensor& src) { + PADDLE_ENFORCE_NOT_NULL( + inplace_version_counter_, + platform::errors::PreconditionNotMet( + "Tensor does not hold inplace_version_counter_.")); -const DDim& Tensor::dims() const { return dims_; } - -int64_t Tensor::numel() const { return product(dims_); } - -void Tensor::ResetHolder(std::shared_ptr holder) { - PADDLE_ENFORCE_EQ( - offset_, 0, - platform::errors::Fatal( - "Only the offset is supported to zero when the holder is reset.")); - if (holder_) { - PADDLE_ENFORCE_LE( - numel() * SizeOfType(type()) + offset_, holder->size(), - paddle::platform::errors::InvalidArgument( - "The size of Holder is not enough to store the Tensor.")); - } - holder_ = holder; -} - -void Tensor::ResetHolderWithType(std::shared_ptr holder, - const proto::VarType::Type& type) { - type_ = type; - ResetHolder(holder); + inplace_version_counter_ = src.inplace_version_counter_; + return *this; } -void Tensor::set_type(const proto::VarType::Type& type) { type_ = type; } - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 4b1ae041fc4ca..e86009e9aafea 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -28,6 +28,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/stream.h" + +#include "paddle/pten/core/dense_tensor.h" namespace paddle { namespace memory { @@ -74,108 +77,10 @@ class LoDTensor; Variable object but not a pointer. */ -class TensorInplaceVersion { - public: - explicit TensorInplaceVersion(uint32_t inplace_version = 0) - : inplace_version_(inplace_version) {} - bool IsUnique() const { return inplace_version_ == 0; } - void Bump() { ++inplace_version_; } - uint32_t CurrentVersion() const { return inplace_version_; } - void SetInplaceVersionToZero() { inplace_version_ = 0; } - - private: - uint32_t inplace_version_; -}; - -class Tensor { -#ifdef PADDLE_WITH_MKLDNN - - public: - inline dnnl::memory::format_tag format() const { return format_; } - - inline void set_format(const dnnl::memory::format_tag format) { - format_ = format; - } - - protected: - /** - * @brief the detail format of memory block which have layout as kMKLDNN - * - * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, layout will be set as - * DataLayout::kMKLDNN meanwhile detail memory format will be kept in - * this field. - */ - - dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef; -#endif - +class Tensor : public pten::DenseTensor { public: - template - friend struct EigenTensor; - - template - friend struct EigenMatrix; - - template - friend struct EigenVector; - - public: - Tensor() - : type_(proto::VarType::FP32), - offset_(0), - inplace_version_counter_(std::make_shared(0)) {} - - explicit Tensor(const proto::VarType::Type&); - - /*! Return a pointer to mutable memory block. */ - template - T* data(); - - /*! Return a pointer to constant memory block. */ - template - const T* data() const; - - inline bool IsInitialized() const; - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - T* mutable_data(const platform::Place& place, size_t requested_size = 0); - - void* mutable_data(const platform::Place& place, proto::VarType::Type type, - size_t requested_size = 0); - - void* mutable_data(const platform::Place& place, size_t requested_size = 0); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - void* mutable_data(const platform::CUDAPlace& place, - proto::VarType::Type type, const gpuStream_t& stream); -#endif - - /** - * @brief Return a pointer to mutable memory block. - * - * @param[in] dims The dimensions of the memory block. - * @param[in] place The place of the memory block. - * @param[in] requested_size The size of the block in bytes. - * - * @note If not exist, then allocation. - */ - template - T* mutable_data(const DDim& dims, const platform::Place& place, - size_t requested_size = 0); - - /*! Return the dimensions of the memory block. */ - const DDim& dims() const; - - /*! Return the numel of the memory block. */ - int64_t numel() const; - - /*! Resize the dimensions of the memory block. */ - Tensor& Resize(const DDim& dims); + using DenseTensor = pten::DenseTensor; + using DenseTensor::DenseTensor; /*! The internal of two tensors share the same memory block. */ Tensor& ShareDataWith(const Tensor& src); @@ -183,149 +88,16 @@ class Tensor { /*! The internal of two tensors share the same inplace version counter. */ Tensor& ShareInplaceVersionCounterWith(const Tensor& src); - /** - * @brief Return a sub-tensor of the given tensor. 
- * - * @param[in] begin_idx The index of the start row(inclusive) to slice. - * The index number begins from 0. - * @param[in] end_idx The index of the end row(exclusive) to slice. - * The index number begins from 0. - */ Tensor Slice(int64_t begin_idx, int64_t end_idx) const; - /** - * @brief Return a tensor list of the given tensor. - * - * @param[in] split_size The size of tensor to be split along axis. - * @param[in] axis The axis along which to split. - */ std::vector Split(int64_t split_size, int64_t axis) const; - /** - * @brief Return a tensor list of the given tensor. - * - * @param[in] chunks The number of tensor to be split along axis. - * @param[in] axis The axis along which to split. - */ std::vector Chunk(int64_t chunks, int64_t axis) const; - const platform::Place& place() const { - PADDLE_ENFORCE_NOT_NULL( - holder_, - platform::errors::PreconditionNotMet( - "Tensor not initialized yet when Tensor::place() is called.")); - return holder_->place(); - } - - proto::VarType::Type type() const { - PADDLE_ENFORCE_NOT_NULL( - holder_, - platform::errors::PreconditionNotMet( - "Tensor not initialized yet when Tensor::type() is called.")); - return type_; - } - - /** - * [Add method get the saved type of tensor] - * - * After the introduction of complex number calculations, Ops that support - * complex number calculations generally support type promotion, such as - * x(float32) + y(complex64) = out(complex64), then the type of the grad - * tensor should be dout(complex64), dx(float32), dy (complex64), but the - * type of dx to be recognized to be float32 by the grad Op relay on the type - * of forward tensor x. But many of our ops have registered InplaceInferer, - * covering the tensor memory of x with out, so as to save storage. - * - * In this case, the dim and type information recorded by x still exist, - * but because x becomes an uninitialized tensor, The type of x record cannot - * be obtained with x.type(), but the type is still valid here, so we - * add saved_type(), This method SHOULD NOT be called by general scenarios. - */ - proto::VarType::Type saved_type() const { return type_; } - - // memory size returns the holding memory size in byte. - size_t memory_size() const; - - void check_memory_size() const; - - DataLayout layout() const { return layout_; } - - void set_layout(const DataLayout layout) { layout_ = layout; } - - void clear() { - holder_ = nullptr; - offset_ = 0; + Tensor& Resize(const DDim& dims) { + meta_.dims = dims; + return *this; } - - void ShareBufferWith(const Tensor& tensor) { - holder_ = tensor.holder_; - offset_ = tensor.offset_; - // NOTE(chenfeiyu): when sharing buffer, by definition only holder - // to the memory allocation and offset should be shared. Shape, - // data type, layout, and other metadata associated with a Tensor - // should not be copied. - } - - void ShareDataTypeWith(const Tensor& tensor) { type_ = tensor.type_; } - - bool IsSharedBufferWith(const Tensor& src) const { - return holder_ && holder_ == src.Holder(); - } - - const std::shared_ptr& Holder() const { return holder_; } - size_t offset() const { return offset_; } - - std::shared_ptr MoveMemoryHolder() { - return std::move(holder_); - } - - void ResetHolder(std::shared_ptr holder); - - void ResetHolderWithType(std::shared_ptr holder, - const proto::VarType::Type& type); - - void set_type(const proto::VarType::Type& type); - - TensorInplaceVersion& InplaceVersionCounter() { - return *inplace_version_counter_; - } - - private: - /*! holds the memory block if allocated. 
*/ - std::shared_ptr holder_; - proto::VarType::Type type_; - /** - * @brief points to elements dimensions. - * - * @note dims_ do not indicate the memory block size. - */ - - DDim dims_; - - /** - * @brief the layout of memory block, default is NHWC. - * - * @note the memory allocation order, describe how weight/data is stored - * For example, in 4-D Tensor(rank=4), there are three commonly - * used layout. They are - * NCHW, NHWC, CHWN. - * N,C,H,W for respectively the batch size, the number of - * feature maps, the height. - */ - // Fix me: here just change the default layout to kNCHW - // it doesn't fix the real issue, i.e. feeder should set up tensor layout - // according to actual input data - DataLayout layout_ = DataLayout::kNCHW; - - /** - * @brief A PlaceHolder may be shared by more than one tensor. - * - * @note Some of them may be slices of the others. So the offset_ - * is introduced here to indicate the byte offset between - * PlaceHolder::ptr_ and where the tensor data really begins. - */ - size_t offset_; - std::shared_ptr inplace_version_counter_; }; } // namespace framework diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 986551b935e88..98ad9a629b5ab 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -20,55 +20,6 @@ limitations under the License. */ namespace paddle { namespace framework { -template -inline const T* Tensor::data() const { - check_memory_size(); - bool valid = - std::is_same::value || type_ == DataTypeTrait::DataType(); - PADDLE_ENFORCE_EQ( - valid, true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type, it holds %s, but desires to be %s.", - DataTypeToString(type_), - DataTypeToString(DataTypeTrait::DataType()))); - - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); -} - -inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } - -template -inline T* Tensor::data() { - check_memory_size(); - bool valid = - std::is_same::value || type_ == DataTypeTrait::DataType(); - PADDLE_ENFORCE_EQ( - valid, true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type, it holds %s, but desires to be %s", - DataTypeToString(type_), - DataTypeToString(DataTypeTrait::DataType()))); - - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); -} - -template -inline T* Tensor::mutable_data(const DDim& dims, const platform::Place& place, - size_t requested_size) { - static_assert(std::is_pod::value, "T must be POD"); - Resize(dims); - return mutable_data(place, requested_size); -} - -template -inline T* Tensor::mutable_data(const platform::Place& place, - size_t requested_size) { - static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast( - mutable_data(place, DataTypeTrait::DataType(), requested_size)); -} inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { int rank = src.dims().size(); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 71ff50c92ca59..a58f4a6b5f4c1 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -45,7 +45,6 @@ TEST(Tensor, DataAssert) { } catch (platform::EnforceNotMet& err) { caught = true; std::string ex_msg = err.what(); - EXPECT_TRUE(ex_msg.find("holder_ should not be null") != std::string::npos); EXPECT_TRUE(ex_msg.find("Tensor holds no memory. 
Call " "Tensor::mutable_data firstly.") != std::string::npos); @@ -189,8 +188,6 @@ TEST(Tensor, ShareDataWith) { } catch (paddle::platform::EnforceNotMet& err) { caught = true; std::string ex_msg = err.what(); - EXPECT_TRUE(ex_msg.find("holder_ should not be null") != - std::string::npos); EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call " "Tensor::mutable_data firstly.") != std::string::npos); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index d655e3e8e53e5..f2323f6e2c6ee 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -23,6 +23,9 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/profiler.h" + +#include "paddle/pten/core/dense_tensor.h" + #ifdef PADDLE_WITH_MKLDNN #include "dnnl_debug.h" // NOLINT #endif @@ -30,22 +33,22 @@ limitations under the License. */ namespace paddle { namespace framework { -void TensorCopy(const Tensor& src, const platform::Place& dst_place, - const platform::DeviceContext& ctx, Tensor* dst) { +template +void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, TENSOR* dst) { if (&src == dst) { auto src_copy = src; - TensorCopy(src_copy, dst_place, ctx, dst); + TensorCopyImpl(src_copy, dst_place, ctx, dst); return; } VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; src.check_memory_size(); - dst->Resize(src.dims()); dst->set_layout(src.layout()); auto src_place = src.place(); - auto src_ptr = src.data(); + auto src_ptr = src.data(); #ifdef PADDLE_WITH_MKLDNN dst->set_format(src.format()); // oneDNN tensors due to padding may be of bigger size @@ -357,10 +360,41 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, "Copying from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_MLU + else if (platform::is_mlu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_mlu_place = BOOST_GET_CONST(platform::MLUPlace, src_place); + auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_cpu_place, dst_ptr, src_mlu_place, src_ptr, size, stream); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_mlu_place(dst_place)) { + auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); + auto dst_mlu_place = BOOST_GET_CONST(platform::MLUPlace, dst_place); + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_mlu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); + } + else if (platform::is_mlu_place(src_place) && // NOLINT + platform::is_mlu_place(dst_place)) { + auto src_mlu_place = BOOST_GET_CONST(platform::MLUPlace, src_place); + auto dst_mlu_place = BOOST_GET_CONST(platform::MLUPlace, dst_place); + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_mlu_place, dst_ptr, src_mlu_place, src_ptr, size, stream); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copying from %s to %s is not supported.", src_place, dst_place)); + } +#endif } -void TensorCopy(const Tensor& src, const platform::Place& dst_place, - Tensor* dst) { +template +void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, + TENSOR* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* 
dev_ctx; if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place)) { @@ -368,7 +402,24 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, } else { dev_ctx = pool.Get(src.place()); } - TensorCopy(src, dst_place, *dev_ctx, dst); + TensorCopyImpl(src, dst_place, *dev_ctx, dst); +} + +void TensorCopy(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + TensorCopyImpl(src, dst_place, dst); +} +void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place, + pten::DenseTensor* dst) { + TensorCopyImpl(src, dst_place, dst); +} +void TensorCopy(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst) { + TensorCopyImpl(src, dst_place, ctx, dst); +} +void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, pten::DenseTensor* dst) { + TensorCopyImpl(src, dst_place, ctx, dst); } void TensorCopySync(const Tensor& src, const platform::Place& dst_place, @@ -388,8 +439,9 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, dst->set_format(src.format()); #endif auto src_place = src.place(); - auto src_ptr = src.data(); + auto src_ptr = src.data(); auto dst_ptr = dst->mutable_data(dst_place, src.type()); + VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr; if (src_ptr == dst_ptr && src_place == dst_place) { VLOG(3) << "Skip copy the same data from " << src_place << " to " @@ -526,6 +578,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_MLU + else if (platform::is_mlu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::MLUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_mlu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_mlu_place(src_place) && // NOLINT + platform::is_mlu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::MLUPlace, src_place), src_ptr, size, + nullptr); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif } template @@ -912,7 +993,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, { // the 3rd field, tensor data uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type()); - auto* data_ptr = tensor.data(); + auto* data_ptr = tensor.data(); PADDLE_ENFORCE_LT(size, (std::numeric_limits::max)(), platform::errors::ResourceExhausted( "tensor size %d overflow when writing tensor", size)); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 575e2171652a2..46eba6a1e41bb 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -34,6 +34,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/mlu/device_context.h" #endif +#include "paddle/pten/core/dense_tensor.h" + namespace paddle { namespace framework { @@ -75,6 +77,8 @@ class Tensor; void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); +void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, pten::DenseTensor* dst); // NOTE(zcd): If the src.place() and dst_place are two different GPU, // the copy operation is carried out on the dst_place's stream. This is @@ -85,6 +89,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, // not completed. void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); +void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place, + pten::DenseTensor* dst); void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst); diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 41fe9fbbc0396..0a24efd003bcf 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/var_desc.h" #include "glog/logging.h" +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -116,6 +117,10 @@ proto::VarType::Type VarDesc::GetDataType() const { return tensor_desc().data_type(); } +size_t VarDesc::ElementSize() const { + return framework::SizeOfType(GetDataType()); +} + std::vector VarDesc::GetDataTypes() const { std::vector descs = tensor_descs(); std::vector res; diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index a6f56ad445834..a20ef58f9c95f 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -69,6 +69,12 @@ class VarDesc { explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {} + // Explicitly implement the copy constructor for auto parallel + VarDesc(const VarDesc &other) + : desc_(other.desc_), + attrs_(other.attrs_), + original_id_(other.original_id_) {} + proto::VarDesc *Proto() { return &desc_; } const proto::VarDesc *Proto() const { return &desc_; } @@ -96,6 +102,8 @@ class VarDesc { proto::VarType::Type GetDataType() const; + size_t ElementSize() const; + std::vector GetDataTypes() const; void SetLoDLevel(int32_t lod_level); @@ -151,16 +159,10 @@ class VarDesc { Attribute GetAttr(const std::string &name) const; - // This thread-safe implementation seems to be redudent since the neural - // networks are usually constructed in a single thread. - static uint64_t GenerateId() { - static std::atomic uid{0}; - return ++uid; - } - - // Note: the identity only used as a key for referring to its - // distributed attribute now. + // The Id() and OriginalId() are only used for auto parallel. uint64_t Id() const { return id_; } + uint64_t OriginalId() const { return original_id_; } + void SetOriginalId(uint64_t original_id) { original_id_ = original_id; } private: const proto::VarType::TensorDesc &tensor_desc() const; @@ -168,9 +170,23 @@ class VarDesc { proto::VarType::TensorDesc *mutable_tensor_desc(); std::vector mutable_tensor_descs(); + // This thread-safe implementation seems to be redudent since the neural + // networks are usually constructed in a single thread. 
+ static uint64_t GenerateId() { + static std::atomic uid{0}; + return ++uid; + } + proto::VarDesc desc_; AttributeMap attrs_; + + // Note: the id_ is unique for all VarDesc (only for auto parallel). uint64_t id_ = GenerateId(); + // Note: the orignal_id_ is used for referring to the original VarDesc + // that the current VarDesc is built from (only for auto parallel). + // The default original_id_ is same as the id_, which means the + // current VarDesc is not built from the other one. + uint64_t original_id_ = id_; }; bool operator==(const VarDesc &left, const VarDesc &right); diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index f8ad990a668ce..2fa48150903ad 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -72,7 +72,7 @@ class Variable { private: // This method hides type T, so it doesn't appear as a template parameter of // Variable. - framework::TensorInplaceVersion* InplaceVersionCounter(); + pten::TensorInplaceVersion* InplaceVersionCounter(); public: void SetInplaceVersionToZero(); @@ -114,8 +114,8 @@ class Variable { std::shared_ptr holder_; }; -inline framework::TensorInplaceVersion* Variable::InplaceVersionCounter() { - framework::TensorInplaceVersion* version_counter_ptr(nullptr); +inline pten::TensorInplaceVersion* Variable::InplaceVersionCounter() { + pten::TensorInplaceVersion* version_counter_ptr(nullptr); if (IsType()) { version_counter_ptr = &GetMutable()->InplaceVersionCounter(); diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 31da214fbc39a..78855cc5c9e2e 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -60,7 +60,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, platform::errors::Unimplemented( "Imperative mode does not support multi-CPU training yet.")); - const void *src_ptr = src.data(); + const void *src_ptr = src.data(); dst->Resize(src.dims()); auto *dst_ptr = dst->mutable_data(src.place(), src.type()); auto nccl_dtype = platform::ToNCCLDataType(src.type()); @@ -129,7 +129,7 @@ static void AllReduce(const framework::SelectedRows &src, auto feature_size = framework::product(dims) / dims[0]; dst_tensor->Resize(dims); auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype); - const auto *src_tensor_ptr = src_tensor.data(); + const auto *src_tensor_ptr = src_tensor.data(); auto sizeof_dtype = framework::SizeOfType(dtype); int64_t row_offset = 0; diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index ee1c4d1be5109..9d37792653664 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -174,7 +174,7 @@ void BasicEngine::PrepareGradAccumulators( if (!var) continue; bool find_grad_node_of_var = false; - if (var->HasGradNode()) { + if (grad_pending_nodes.size()) { // Because Inplace op overwrites the grad_node of the input grad_var. So // only the information of grad_pending_node can be used to find the // grad_node of grad_var. 
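The var_desc.h hunk above makes GenerateId() private and pairs every id_ with an original_id_, so that a VarDesc copied for auto parallel can still refer back to the desc it was built from. A stripped-down sketch of that bookkeeping (the class name is illustrative; only the member names and initializers mirror the patch):

```cpp
#include <atomic>
#include <cstdint>

class DescWithId {
 public:
  uint64_t Id() const { return id_; }
  uint64_t OriginalId() const { return original_id_; }
  void SetOriginalId(uint64_t original_id) { original_id_ = original_id; }

 private:
  // Thread-safe, process-wide unique id. As the patch notes, this is mostly
  // redundant in practice because programs are usually built single-threaded.
  static uint64_t GenerateId() {
    static std::atomic<uint64_t> uid{0};
    return ++uid;
  }

  uint64_t id_ = GenerateId();
  // Defaults to id_, i.e. "this desc is original"; a desc built from another
  // one records the source id here so distributed attributes can follow it.
  uint64_t original_id_ = id_;
};
```

The explicit copy constructor added alongside it copies original_id_ rather than regenerating it, while the copy still receives a fresh id_ from GenerateId().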
@@ -240,7 +240,7 @@ void BasicEngine::PrepareGradAccumulators( } } - if (!var->HasGradNode() || !find_grad_node_of_var) { + if (!grad_pending_nodes.size() || !find_grad_node_of_var) { auto& accumulator = accumulators_[var.get()]; if (!accumulator) { if (FLAGS_sort_sum_gradient) { @@ -438,15 +438,15 @@ void BasicEngine::Execute() { continue; } + const auto& grad_pending_nodes = shared_cur_node->GradPendingNodes(); std::unordered_map>::iterator iter; bool flag_find_grad = false; - if (var->HasGradNode()) { + if (grad_pending_nodes.size()) { VLOG(10) << "Find gradient of var (" << var->Name() << ") with grad_node."; - for (auto& grad_pending_node : - shared_cur_node->GradPendingNodes()) { + for (auto& grad_pending_node : grad_pending_nodes) { const auto& iter_grad_node = accumulators_with_grad_node_.find(grad_pending_node); if (iter_grad_node != accumulators_with_grad_node_.end()) { @@ -458,10 +458,11 @@ void BasicEngine::Execute() { } } if (!flag_find_grad) { - VLOG(6) << "Cannot find gradient of variable " << var->Name(); + VLOG(6) << "Cannot find gradient of variable " << var->Name() + << " in accumulators_with_grad_node_"; } } - if (!var->HasGradNode() || !flag_find_grad) { + if (!grad_pending_nodes.size() || !flag_find_grad) { VLOG(10) << "Find gradient of var (" << var->Name() << ") with no grad_node."; iter = accumulators_.find(var.get()); diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 6569929d6f5d7..2072c41673aaf 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -39,7 +39,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, platform::errors::Unimplemented( "Dynamic graph mode does not support multi-CPU training yet.")); - const void *src_ptr = src.data(); + const void *src_ptr = src.data(); dst->Resize(src.dims()); auto *dst_ptr = dst->mutable_data(src.place(), src.type()); auto bkcl_dtype = platform::ToBKCLDataType(src.type()); @@ -158,7 +158,7 @@ void BKCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { platform::BKCLCommContext::Instance().Get(ring_id, place); XPUStream stream = comm->stream(); - void *src_ptr = src_tensor->data(); + void *src_ptr = src_tensor->data(); auto data_type = platform::ToBKCLDataType(src_tensor->type()); PADDLE_ENFORCE_EQ(bkcl_broadcast(comm->comm(), src_ptr, src_ptr, diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 55c52ae6c11de..818b2f424b6af 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -42,7 +42,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, platform::errors::Unimplemented( "Imperative mode does not support multi-CPU training yet.")); - void *src_ptr = const_cast(src.data()); + void *src_ptr = const_cast(src.data()); dst->Resize(src.dims()); void *dst_ptr = dst->mutable_data(src.place(), src.type()); HcclDataType hccl_dtype = platform::ToHCCLDataType(src.type()); @@ -168,7 +168,7 @@ void HCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { aclrtStream stream = comm->stream(); void *src_ptr = - reinterpret_cast(const_cast(src_tensor->data())); + reinterpret_cast(const_cast(src_tensor->data())); auto hccl_dtype = platform::ToHCCLDataType(src_tensor->type()); PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( src_ptr, src_tensor->numel(), hccl_dtype, 0, comm->comm(), diff --git a/paddle/fluid/imperative/layer.h 
b/paddle/fluid/imperative/layer.h index 892c864027d11..199d62bff1f20 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -25,6 +25,7 @@ #include #include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/var_type.h" @@ -37,7 +38,6 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" #include "paddle/pten/include/core.h" - namespace paddle { namespace framework { class Variable; @@ -212,6 +212,8 @@ class VarBase { framework::proto::VarType::Type DataType() const { return var_->DataType(); } + size_t ElementSize() const { return framework::SizeOfType(var_->DataType()); } + void SetForwardDataType(framework::proto::VarType::Type data_type) { var_->SetForwardDataType(data_type); } diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 15146f6c1204e..1b50c515635d2 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -143,7 +143,7 @@ void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { platform::NCCLCommContext::Instance().Get(ring_id, place); gpuStream_t stream = comm->stream(); - void *src_ptr = src_tensor->data(); + void *src_ptr = src_tensor->data(); auto nccl_dtype = platform::ToNCCLDataType(src_tensor->type()); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream)); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c4092a33aa332..29cd24a1e7793 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -29,6 +29,7 @@ DECLARE_bool(check_nan_inf); DECLARE_bool(run_pten_kernel); DECLARE_bool(benchmark); +DECLARE_bool(run_kp_kernel); namespace paddle { namespace imperative { @@ -216,6 +217,16 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_MLU + if (kernel_iter == kernels.end() && + is_mlu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing MLU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that // case @@ -480,8 +491,7 @@ static void PreparedOpRunImpl( DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, op.Type()); - static_cast(op).InferShape( - &infer_shape_ctx); + op.Info().infer_shape_(&infer_shape_ctx); func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, default_attrs)); @@ -526,8 +536,7 @@ static void PreparedOpRunPtImpl( const framework::AttributeMap& default_attrs) { DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, op.Type()); - static_cast(op).InferShape( - &infer_shape_ctx); + op.Info().infer_shape_(&infer_shape_ctx); BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, outs, attrs, default_attrs, dev_ctx, diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 9014871229b39..beddbd5d12008 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -48,9 +48,19 @@ void 
Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { } else if (platform::is_cpu_place(tensor->place())) { VLOG(4) << "before div 2" << *tensor; VLOG(4) << "NDiv for cpu devices : rank = " << nranks; - framework::VisitDataTypeSmall( +#ifdef PADDLE_WITH_HIP + if (dtype_ == paddle::framework::proto::VarType_Type_BF16) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Unsupport BF16 in DataParallel for now")); + } + framework::VisitDataTypeForHIP( dtype_, DivNRanksForAllReduce( tensor, nranks, context)); +#else + framework::VisitDataType(dtype_, + DivNRanksForAllReduce( + tensor, nranks, context)); +#endif VLOG(4) << "after div 2" << *tensor; } else if (platform::is_xpu_place(tensor->place())) { #ifdef PADDLE_WITH_XPU_BKCL diff --git a/paddle/fluid/imperative/reducer.cu b/paddle/fluid/imperative/reducer.cu index ca233292b3470..05453a61b7e39 100644 --- a/paddle/fluid/imperative/reducer.cu +++ b/paddle/fluid/imperative/reducer.cu @@ -20,9 +20,19 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void Group::DivNRanks(framework::Tensor *tensor, int64_t nranks, const platform::DeviceContext &context) { - framework::VisitDataTypeSmall( +#ifdef PADDLE_WITH_HIP + if (dtype_ == paddle::framework::proto::VarType_Type_BF16) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Unsupport BF16 in DataParallel for now")); + } + framework::VisitDataTypeForHIP( dtype_, DivNRanksForAllReduce(tensor, nranks, context)); +#else + framework::VisitDataType( + dtype_, DivNRanksForAllReduce(tensor, nranks, + context)); +#endif } #endif diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index cb4ab2e79cb99..0a7df9953ad45 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -176,8 +176,8 @@ static bool IsEqualVar(const framework::Variable& var1, return false; } - auto* t1_p = t1.data(); - auto* t2_p = t2.data(); + auto* t1_p = t1.data(); + auto* t2_p = t2.data(); return std::memcmp(t1_p, t2_p, t1.numel() * framework::SizeOfType(t1.type())) == 0; } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 9993bb37d5140..8bb08b6fdaf2a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -56,8 +56,17 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { // Because there exists the case that new parameter variables are not added to // the program in the analysis pass. 
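The prepared_operator.cc hunk above extends the existing "missing device kernel, fall back to CPU" logic to MLU places: when the kernel map has no entry for the expected MLU key, the place in the key is rewritten to CPUPlace and the lookup is retried. A minimal sketch of that lookup pattern, with simplified stand-ins for the real kernel map and OpKernelType:

```cpp
#include <map>
#include <string>
#include <utility>

enum class Place { kCPU, kMLU };
using KernelKey = std::pair<std::string, Place>;  // (data type, place), simplified
using KernelFn = void (*)();

KernelFn FindKernelWithCpuFallback(const std::map<KernelKey, KernelFn>& kernels,
                                   KernelKey expected_key) {
  auto iter = kernels.find(expected_key);
  if (iter == kernels.end() && expected_key.second == Place::kMLU) {
    // No MLU kernel registered for this op: keep the rest of the key and
    // retry with a CPU place, mirroring the fallback added above.
    expected_key.second = Place::kCPU;
    iter = kernels.find(expected_key);
  }
  return iter == kernels.end() ? nullptr : iter->second;
}
```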
bool reserve_cpu_weights = false; - if (argument->tensorrt_allow_build_at_runtime_valid() && - argument->tensorrt_allow_build_at_runtime()) { + bool with_dynamic_shape = false; + if (argument->Has("max_input_shape") && argument->Has("min_input_shape") && + argument->Has("optim_input_shape")) { + with_dynamic_shape = (argument->max_input_shape().size() > 0 && + argument->min_input_shape().size() > 0 && + argument->optim_input_shape().size() > 0); + } + with_dynamic_shape = + with_dynamic_shape || (argument->Has("tensorrt_tuned_dynamic_shape") && + argument->tensorrt_tuned_dynamic_shape()); + if (with_dynamic_shape) { reserve_cpu_weights = true; } for (auto &var_name : all_vars) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7bf67ce564285..c45bc8a1215d9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -24,6 +24,7 @@ #include #include +#include "paddle/fluid//platform/device/gpu/gpu_types.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" @@ -410,7 +411,10 @@ void AnalysisPredictor::MkldnnPreSet( void AnalysisPredictor::MkldnnPostReset() { #ifdef PADDLE_WITH_MKLDNN // In cache clearing mode. - if (config_.mkldnn_cache_capacity_ > 0) { + if (config_.mkldnn_cache_capacity_ > 0 && + static_cast( + (&platform::DeviceContextPool::Instance())->Get(platform::CPUPlace())) + ->GetCachedObjectsNumber() > 0) { if (VLOG_IS_ON(2)) { auto shape_blob_size = static_cast( (&platform::DeviceContextPool::Instance()) @@ -585,9 +589,6 @@ void AnalysisPredictor::PrepareArgument() { if (!config_.model_dir().empty()) { argument_.SetModelDir(config_.model_dir()); } else { - PADDLE_ENFORCE_EQ(config_.params_file().empty(), false, - platform::errors::PreconditionNotMet( - "Either model_dir or param_file should be set.")); PADDLE_ENFORCE_EQ(config_.prog_file().empty(), false, platform::errors::PreconditionNotMet( "Either model_dir or prog_file should be set.")); @@ -1055,6 +1056,20 @@ bool AnalysisPredictor::ZeroCopyRun() { return true; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { + if (stream != nullptr) { + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_); + auto *dev_ctx = reinterpret_cast( + pool.Get(gpu_place)); + dev_ctx->SetThreadLocalStream(stream); + } + return ZeroCopyRun(); +} +#endif + void AnalysisPredictor::CollectShapeRangeInfo() { // if use gpu, sync first. if (config_.use_gpu()) { @@ -1132,7 +1147,7 @@ bool AnalysisPredictor::LoadProgramDesc() { std::string filename; if (!config_.model_dir().empty()) { filename = config_.model_dir() + "/__model__"; - } else if (!config_.prog_file().empty() && !config_.params_file().empty()) { + } else if (!config_.prog_file().empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used // in Python API `fluid.io.save_inference_model`. 
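The new AnalysisPredictor::ExpRunWithExternalStream above binds a caller-supplied stream to the thread-local CUDADeviceContext via SetThreadLocalStream and then delegates to ZeroCopyRun; it is exposed to users through paddle_infer::experimental::InternalUtils::RunWithExternalStream, declared further down in paddle_inference_api.h. A hypothetical caller-side sketch (stream creation, synchronization, and the in-tree include path are illustrative, not part of the patch):

```cpp
#include <cuda_runtime.h>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Runs one inference pass on a stream owned by the application rather than
// by the predictor. Only valid under thread_local semantics, as noted above.
bool RunOnExternalStream(paddle_infer::Predictor* predictor) {
  cudaStream_t stream = nullptr;
  cudaStreamCreate(&stream);
  bool ok = paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor, stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return ok;
}
```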
@@ -1344,6 +1359,7 @@ std::unique_ptr AnalysisPredictor::Clone() { std::lock_guard lk(clone_mutex_); auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); + x->executor_->ResetTrtOps(++x->clone_num_); return std::unique_ptr(x); } @@ -1579,4 +1595,25 @@ Predictor *PredictorPool::Retrive(size_t idx) { return preds_[idx - 1].get(); } } // namespace services + +namespace experimental { + +// Note: Can only be used under thread_local semantics. +bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, + cudaStream_t stream) { +#ifdef PADDLE_WITH_CUDA + auto pred = dynamic_cast(p->predictor_.get()); + return pred->ExpRunWithExternalStream(stream); +#endif + return false; +} +bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, + hipStream_t stream) { +#ifdef PADDLE_WITH_HIP + auto pred = dynamic_cast(p->predictor_.get()); + return pred->ExpRunWithExternalStream(stream); +#endif + return false; +} +} // namespace experimental } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9c36051757527..a8e56101d37da 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -25,6 +25,7 @@ #include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/string/printf.h" #ifdef PADDLE_WITH_TESTING @@ -172,6 +173,11 @@ class AnalysisPredictor : public PaddlePredictor { /// bool ZeroCopyRun() override; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // Note: Can only be used under thread_local semantics. 
+ bool ExpRunWithExternalStream(const gpuStream_t stream); +#endif + /// /// \brief Create feed fetch variables /// @@ -429,6 +435,7 @@ class AnalysisPredictor : public PaddlePredictor { bool status_is_cloned_{false}; std::map>> shape_info_; + int clone_num_{1}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 89aec34110b85..124279d246093 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -37,13 +37,13 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor pt; if (t->type() == framework::proto::VarType::INT64) { - pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); + pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; } else if (t->type() == framework::proto::VarType::FP32) { - pt.data.Reset(t->data(), t->numel() * sizeof(float)); + pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else if (t->type() == framework::proto::VarType::INT32) { - pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); + pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); pt.dtype = PaddleDType::INT32; } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 1fdc5cd730e53..d03840ada36bc 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -54,8 +54,10 @@ if (WIN32) if (WITH_MKL) set(FLAG_OPENMP "/openmp") endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4244 /wd4251 /wd4267 /wd4305") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4251 /wd4267 /wd4305") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") safe_set_static_flag() diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index aa29b779e471b..ef9d03d1dcbaf 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -124,7 +124,8 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( } else if (op->Type() == "relu") { is_unsigned = true; } else if (op->Type() == "transpose2" || op->Type() == "reshape2" || - op->Type() == "pool2d") { + op->Type() == "pool2d" || op->Type() == "nearest_interp" || + op->Type() == "nearest_interp_v2") { auto input_var_name = op->Input("X")[0]; PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(), platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index 6642a2c030b26..d4fa78518e149 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -107,6 +107,18 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["fusion_lstm"]["ReorderedC0"] = ScaleAlgo::NONE; rules_["fusion_lstm"]["CheckedCell"] = ScaleAlgo::NONE; rules_["fusion_lstm"]["Hidden"] = ScaleAlgo::KL; + + rules_["nearest_interp"]["X"] = ScaleAlgo::KL; + rules_["nearest_interp"]["OutSize"] = ScaleAlgo::NONE; + 
rules_["nearest_interp"]["SizeTensor"] = ScaleAlgo::NONE; + rules_["nearest_interp"]["Scale"] = ScaleAlgo::NONE; + rules_["nearest_interp"]["Out"] = ScaleAlgo::NONE; + + rules_["nearest_interp_v2"]["X"] = ScaleAlgo::KL; + rules_["nearest_interp_v2"]["OutSize"] = ScaleAlgo::NONE; + rules_["nearest_interp_v2"]["SizeTensor"] = ScaleAlgo::NONE; + rules_["nearest_interp_v2"]["Scale"] = ScaleAlgo::NONE; + rules_["nearest_interp_v2"]["Out"] = ScaleAlgo::NONE; } ScaleAlgo MkldnnQuantizerConfig::scale_algo( diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 35b90bfa54f73..b2b9f2e407478 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -41,11 +41,27 @@ limitations under the License. */ /// \since 2.0.0-beta /// +// forward declation +using cudaStream_t = struct CUstream_st*; +using hipStream_t = struct ihipStream_t*; + namespace paddle_infer { using PrecisionType = paddle::AnalysisConfig::Precision; using Config = paddle::AnalysisConfig; +class Predictor; +namespace experimental { +class PD_INFER_DECL InternalUtils { + public: + // Note: Can only be used under thread_local semantics. + static bool RunWithExternalStream(paddle_infer::Predictor* pred, + cudaStream_t stream); + static bool RunWithExternalStream(paddle_infer::Predictor* pred, + hipStream_t stream); +}; +} // namespace experimental + /// /// \class Predictor /// @@ -150,6 +166,7 @@ class PD_INFER_DECL Predictor { private: std::unique_ptr predictor_; + friend class paddle_infer::experimental::InternalUtils; }; /// diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index b1ad2f6c87cc6..e342190fda1ac 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -459,12 +459,10 @@ __pd_give PD_OneDimArrayCstr* PD_ConfigAllPasses( std::vector passes = config->pass_builder()->AllPasses(); return paddle_infer::CvtVecToOneDimArrayCstr(passes); } -const char* PD_ConfigSummary(__pd_keep PD_Config* pd_config) { +__pd_give PD_Cstr* PD_ConfigSummary(__pd_keep PD_Config* pd_config) { CHECK_AND_CONVERT_PD_CONFIG; auto sum_str = config->Summary(); - char* c = reinterpret_cast(malloc(sum_str.length() + 1)); - snprintf(c, sum_str.length() + 1, "%s", sum_str.c_str()); - return c; + return paddle_infer::CvtStrToCstr(sum_str); } } // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index e8ab9357dc95d..c314aca918f14 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -705,7 +705,7 @@ PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* PD_ConfigAllPasses( /// /// \return Return config info. 
/// -PADDLE_CAPI_EXPORT extern const char* PD_ConfigSummary( +PADDLE_CAPI_EXPORT extern __pd_give PD_Cstr* PD_ConfigSummary( __pd_keep PD_Config* pd_config); #ifdef __cplusplus diff --git a/paddle/fluid/inference/capi_exp/pd_types.h b/paddle/fluid/inference/capi_exp/pd_types.h index a5da2913a9b20..62c54616535cf 100644 --- a/paddle/fluid/inference/capi_exp/pd_types.h +++ b/paddle/fluid/inference/capi_exp/pd_types.h @@ -34,6 +34,11 @@ typedef struct PD_OneDimArrayCstr { char** data; } PD_OneDimArrayCstr; // std::vector +typedef struct PD_Cstr { + size_t size; + char* data; +} PD_Cstr; // std::string + typedef struct PD_TwoDimArraySize { size_t size; PD_OneDimArraySize** data; diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc index 94362b8784bb3..efca350fbaf49 100644 --- a/paddle/fluid/inference/capi_exp/pd_utils.cc +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -78,6 +78,17 @@ void PD_OneDimArrayCstrDestroy(__pd_take PD_OneDimArrayCstr* array) { delete array; } } + +void PD_CstrDestroy(__pd_take PD_Cstr* cstr) { + if (cstr != NULL) { + if (cstr->size != 0) { + cstr->size = 0; + delete[] cstr->data; + cstr->data = NULL; + } + delete cstr; + } +} namespace paddle_infer { __pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( @@ -101,6 +112,18 @@ std::vector CvtOneDimArrayToVecCstr( return vec; } +__pd_give PD_Cstr* CvtStrToCstr(const std::string& str) { + PD_Cstr* cstr = new PD_Cstr; + if (str.empty()) { + cstr->size = 0; + cstr->data = NULL; + } else { + cstr->size = str.length() + 1; + cstr->data = new char[str.length() + 1]; + memcpy(cstr->data, str.c_str(), str.length() + 1); + } + return cstr; +} } // namespace paddle_infer #define DESTROY_TWO_DIM_ARRAY(type) \ diff --git a/paddle/fluid/inference/capi_exp/pd_utils.h b/paddle/fluid/inference/capi_exp/pd_utils.h index 68e519d4bb5e9..8743c58db76c9 100644 --- a/paddle/fluid/inference/capi_exp/pd_utils.h +++ b/paddle/fluid/inference/capi_exp/pd_utils.h @@ -65,6 +65,15 @@ PADDLE_CAPI_EXPORT extern void PD_OneDimArraySizeDestroy( PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy( __pd_take PD_TwoDimArraySize* array); +/// +/// \brief Destroy the PD_Cstr object pointed to by the pointer. +/// NOTE: if input string is empty, the return PD_Cstr's size is +/// 0 and data is NULL. +/// +/// \param[in] cstr pointer to the PD_Cstr object. +/// +PADDLE_CAPI_EXPORT extern void PD_CstrDestroy(__pd_take PD_Cstr* cstr); + #ifdef __cplusplus } // extern "C" #endif diff --git a/paddle/fluid/inference/capi_exp/utils_internal.h b/paddle/fluid/inference/capi_exp/utils_internal.h index fbae512ecd855..95b16dbd59943 100644 --- a/paddle/fluid/inference/capi_exp/utils_internal.h +++ b/paddle/fluid/inference/capi_exp/utils_internal.h @@ -114,6 +114,14 @@ __pd_give PD_TwoDimArraySize* CvtVecToTwoDimArraySize( std::vector> CvtTwoDimArrayToVecSize( __pd_keep const PD_TwoDimArraySize* array); +/// +/// \brief Convert the 'std::string' object to a 'PD_Cstr' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_Cstr* CvtStrToCstr(const std::string& vec); + /// /// \brief Convert the 'PD_PlaceType' object to a 'paddle_infer::PlaceType' /// object. 
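The capi_exp hunks above replace the raw malloc'ed char* returned by PD_ConfigSummary with a caller-owned PD_Cstr that must be released through PD_CstrDestroy (an empty string comes back as size 0 and NULL data). One way a C++ caller might wrap that ownership contract; the RAII helper itself is illustrative and not part of the patch:

```cpp
#include <memory>
#include <string>

#include "paddle/fluid/inference/capi_exp/pd_config.h"
#include "paddle/fluid/inference/capi_exp/pd_utils.h"

// Deleter that forwards to the C API, so a unique_ptr can own the PD_Cstr.
struct PDCstrDeleter {
  void operator()(PD_Cstr* cstr) const { PD_CstrDestroy(cstr); }
};
using ScopedPDCstr = std::unique_ptr<PD_Cstr, PDCstrDeleter>;

std::string ConfigSummaryString(PD_Config* config) {
  ScopedPDCstr summary(PD_ConfigSummary(config));
  // Empty summaries are returned as {size = 0, data = NULL}.
  return (summary && summary->data) ? std::string(summary->data)
                                    : std::string();
}
```

The Go binding below follows the same contract: it reads cSummary.data and then calls PD_CstrDestroy instead of C.free.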
diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index a95bb6bef6ee4..def26913b0a1c 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -833,7 +833,7 @@ func (config *Config) AllPasses() []string { /// func (config *Config) Summary() string { cSummary := C.PD_ConfigSummary(config.c) - summary := C.GoString(cSummary) - C.free(unsafe.Pointer(cSummary)) + summary := C.GoString(cSummary.data) + C.PD_CstrDestroy(cSummary) return summary } diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index cbc947ea6436a..b1e0eb5ef16ab 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -210,7 +210,7 @@ void TensorCopyAsync(paddle::lite_api::Tensor* dst, const size_t bytes = static_cast(src.numel()) * framework::SizeOfType(src.type()); dst->Resize(framework::vectorize(src.dims())); - const void* src_data = src.data(); + const void* src_data = src.data(); void* dst_data{nullptr}; dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), GetLiteTargetType(src.place())); @@ -242,7 +242,7 @@ void TensorCopyAsync(framework::LoDTensor* dst, template <> void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) { dst->Resize(framework::vectorize(src->dims())); - dst->ShareExternalMemory(src->data(), src->memory_size(), + dst->ShareExternalMemory(src->data(), src->memory_size(), GetLiteTargetType(src->place())); dst->SetPrecision(GetLitePrecisionType(src->type())); paddle::lite::LoD lite_lod; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index cb815e00c4430..2a35f497ed07f 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -42,7 +42,10 @@ void TensorRTEngine::InitNetwork() { } infer_builder_config_.reset(infer_builder_->createBuilderConfig()); - optim_profile_ = infer_builder_->createOptimizationProfile(); + // optim_profile_ = infer_builder_->createOptimizationProfile(); + optim_profiles_.resize(max_profile_num_); + for (int i = 0; i < max_profile_num_; i++) + optim_profiles_[i] = infer_builder_->createOptimizationProfile(); } void TensorRTEngine::Execute(int batch_size, std::vector *buffers, @@ -199,35 +202,38 @@ void TensorRTEngine::FreezeNetwork() { if (with_dynamic_shape_) { #if IS_TRT_VERSION_GE(6000) LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; - for (auto &input : min_input_shape_) { + for (int i = 0; i < max_profile_num_; i++) { + for (auto &input : min_input_shape_) { #if IS_TRT_VERSION_LT(7000) - // trt6 will check all_of input > 0 - if (!(std::all_of(input.second.begin(), input.second.end(), - [](int x) { return x > 0; }) && - std::all_of(max_input_shape_[input.first].begin(), - max_input_shape_[input.first].end(), - [](int x) { return x > 0; }) && - std::all_of(optim_input_shape_[input.first].begin(), - optim_input_shape_[input.first].end(), - [](int x) { return x > 0; }))) { - continue; - } + // trt6 will check all_of input > 0 + if (!(std::all_of(input.second.begin(), input.second.end(), + [](int x) { return x > 0; }) && + std::all_of(max_input_shape_[input.first].begin(), + max_input_shape_[input.first].end(), + [](int x) { return x > 0; }) && + std::all_of(optim_input_shape_[input.first].begin(), + optim_input_shape_[input.first].end(), + [](int x) { return x > 0; }))) { + continue; + } #endif - VLOG(4) << "TRT dynamic_shape set " << input.first - << " min: 
" << Vec2Str(input.second) - << ", max: " << Vec2Str(max_input_shape_[input.first]) - << ", opt: " << Vec2Str(optim_input_shape_[input.first]); - optim_profile_->setDimensions( - input.first.c_str(), nvinfer1::OptProfileSelector::kMIN, - Vec2TRT_Dims(input.second, input.first, true)); - optim_profile_->setDimensions( - input.first.c_str(), nvinfer1::OptProfileSelector::kMAX, - Vec2TRT_Dims(max_input_shape_[input.first], input.first, true)); - optim_profile_->setDimensions( - input.first.c_str(), nvinfer1::OptProfileSelector::kOPT, - Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true)); + VLOG(4) << "TRT dynamic_shape set " << input.first + << " min: " << Vec2Str(input.second) + << ", max: " << Vec2Str(max_input_shape_[input.first]) + << ", opt: " << Vec2Str(optim_input_shape_[input.first]); + + optim_profiles_[i]->setDimensions( + input.first.c_str(), nvinfer1::OptProfileSelector::kMIN, + Vec2TRT_Dims(input.second, input.first, true)); + optim_profiles_[i]->setDimensions( + input.first.c_str(), nvinfer1::OptProfileSelector::kMAX, + Vec2TRT_Dims(max_input_shape_[input.first], input.first, true)); + optim_profiles_[i]->setDimensions( + input.first.c_str(), nvinfer1::OptProfileSelector::kOPT, + Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true)); + } + infer_builder_config_->addOptimizationProfile(optim_profiles_[i]); } - infer_builder_config_->addOptimizationProfile(optim_profile_); if (WithFp16() && disable_trt_plugin_fp16()) { LOG(INFO) << "NOTE: In order to achieve higher accuracy, you have " "disabled the fp16 mode of TRT Plugin,\n" @@ -237,7 +243,6 @@ void TensorRTEngine::FreezeNetwork() { } #endif } - #if IS_TRT_VERSION_GE(8200) infer_builder_config_->setProfilingVerbosity( nvinfer1::ProfilingVerbosity::kDETAILED); @@ -260,6 +265,13 @@ void TensorRTEngine::FreezeNetwork() { "Build TensorRT cuda engine failed! Please recheck " "you configurations related to paddle-TensorRT.")); + binding_num_ = infer_engine_->getNbBindings(); + // reset status for dynamic shape clone + if (max_profile_num_ > 1) { + infer_context_.clear(); + cur_profile_num_ = 0; + } + GetEngineInfo(); } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index e6f58c8c8e8f4..7aaeb739de194 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -253,10 +253,38 @@ class TensorRTEngine { infer_engine_, platform::errors::InvalidArgument( "You should build engine first and then set the context.")); + // We may see trt warning: Profile 0 has been chosen by another + // IExecutionContext... + // It's ok. We will set it later. 
infer_context_[tid].reset(infer_engine_->createExecutionContext()); + if (with_dynamic_shape_) { + // need new profile if it's not the first + if (cur_profile_num_ > 0) { + infer_context_[tid]->setOptimizationProfile(cur_profile_num_); + } + profile_index_[tid] = cur_profile_num_; + ++cur_profile_num_; + } } return infer_context_[tid].get(); } + + int GetProfileIndex() { + if (max_profile_num_ > 1) { + std::unique_lock lock(mutex_); + const std::thread::id tid = std::this_thread::get_id(); + return profile_index_[tid]; + } else { + return 0; + } + } + + int GetBindingsOffset() { + return (binding_num_ / max_profile_num_) * GetProfileIndex(); + } + + int GetNbBindings() { return binding_num_; } + void ResetContext() { std::unique_lock lock(mutex_); const std::thread::id tid = std::this_thread::get_id(); @@ -322,6 +350,7 @@ class TensorRTEngine { "generating serialization file and doing inference are " "consistent.")); + binding_num_ = infer_engine_->getNbBindings(); GetEngineInfo(); } @@ -540,6 +569,7 @@ class TensorRTEngine { } } + void SetProfileNum(int num) { max_profile_num_ = num; } void GetEngineInfo() { #if IS_TRT_VERSION_GE(8200) std::unique_ptr infer_inspector( @@ -571,6 +601,9 @@ class TensorRTEngine { int batch_size_{-1}; int device_id_; + int max_profile_num_{1}; + int cur_profile_num_{0}; + std::unordered_map profile_index_; ShapeMapType min_input_shape_; ShapeMapType max_input_shape_; ShapeMapType optim_input_shape_; @@ -614,8 +647,9 @@ class TensorRTEngine { // For dynamic shape bool with_dynamic_shape_{false}; #if IS_TRT_VERSION_GE(6000) + int binding_num_; infer_ptr infer_builder_config_; - nvinfer1::IOptimizationProfile* optim_profile_; + std::vector optim_profiles_; std::vector> owned_pluginv2_; #endif std::mutex mutex_; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 6f0dec45644ef..8504474168d53 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -726,6 +726,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); if (!(out_h > 0 && out_w > 0)) { + if (scale.size() < 2) return false; if (scale[0] <= 0.f || scale[1] <= 0.f) { VLOG(3) << "scale factor must be greater than 0 if out_h or out_w is " "not set."; diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 313d58ce97ece..82f4420a2a04c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -32,9 +32,7 @@ namespace plugin { template EmbEltwiseLayernormPluginDynamicImpl< - T>::~EmbEltwiseLayernormPluginDynamicImpl() { - this->terminate(); -} + T>::~EmbEltwiseLayernormPluginDynamicImpl() {} inline half fp32tofp16(float x) { return static_cast(x); } diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index cdf353465c818..8e59fc1355a75 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -192,11 +192,8 @@ bool QkvToContextPluginDynamic::supportsFormatCombination( if (pos == 0) { if (with_fp16_) { #ifdef TRT_PLUGIN_FP16_AVALIABLE - return ( -#if 
IS_TRT_VERSION_LT(8000) - in.type == nvinfer1::DataType::kFLOAT || -#endif - in.type == nvinfer1::DataType::kHALF) && + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && (in.format == nvinfer1::TensorFormat::kLINEAR); #else return (in.type == nvinfer1::DataType::kFLOAT) && diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index 21e2660c9413d..fb14749f3d1db 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -73,11 +73,8 @@ bool SkipLayerNormPluginDynamic::supportsFormatCombination( if (pos == 0) { if (with_fp16_) { #ifdef TRT_PLUGIN_FP16_AVALIABLE - return ( -#if IS_TRT_VERSION_LT(8000) - in.type == nvinfer1::DataType::kFLOAT || -#endif - in.type == nvinfer1::DataType::kHALF) && + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && (in.format == nvinfer1::TensorFormat::kLINEAR); #else return (in.type == nvinfer1::DataType::kFLOAT) && diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index 2980aa2c7598c..2b6541c5515ce 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -83,11 +83,8 @@ SlicePlugin *SlicePlugin::clone() const TRT_NOEXCEPT { bool SlicePlugin::supportsFormat( nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { if (with_fp16_) { - return (( -#if IS_TRT_VERSION_LT(8000) - type == nvinfer1::DataType::kFLOAT || -#endif - type == nvinfer1::DataType::kHALF) && + return ((type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF) && (format == nvinfer1::PluginFormat::kLINEAR)); } else { return ((type == nvinfer1::DataType::kFLOAT) && @@ -287,11 +284,8 @@ bool SlicePluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc &in = in_out[pos]; if (pos == 0) { if (with_fp16_) { - return ( -#if IS_TRT_VERSION_LT(8000) - in.type == nvinfer1::DataType::kFLOAT || -#endif - in.type == nvinfer1::DataType::kHALF) && + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && (in.format == nvinfer1::TensorFormat::kLINEAR); } else { return (in.type == nvinfer1::DataType::kFLOAT) && diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc index 11de1a5a6fab4..4b2852be86149 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc @@ -18,7 +18,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/inference/capi_exp/pd_config.h" #include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { @@ -34,6 +36,8 @@ void predictor_run() { PD_ConfigSetCpuMathLibraryNumThreads(config, 10); PD_ConfigSwitchIrDebug(config, TRUE); PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + PD_Cstr *config_summary = PD_ConfigSummary(config); + LOG(INFO) << config_summary->data; PD_Predictor *predictor = PD_PredictorCreate(config); PD_Tensor *tensor = PD_PredictorGetInputHandle(predictor, "data"); @@ -51,6 +55,7 @@ void predictor_run() { delete[] input; PD_TensorDestroy(tensor); + PD_CstrDestroy(config_summary); PD_PredictorDestroy(predictor); } diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc index 4f6742b88b28c..ccdf237ffa54d 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_test.cc @@ -207,6 +207,87 @@ void TestTunedDynamic() { check_func(test_predictor.get()); } +void TestDynamicClone(bool with_dynamic = true, bool delete_cache = true, + bool delete_conv_bn = false) { + std::string model_dir = + FLAGS_infer_model + "/conv_bn_swish_split_gelu/conv_bn_swish_split_gelu"; + + std::string opt_cache_dir = model_dir + "/my_cache"; + if (delete_cache) { + delete_cache_files(opt_cache_dir); + } + + AnalysisConfig config; + config.EnableUseGpu(100, 0); + std::string buffer_prog, buffer_param; + ReadBinaryFile(model_dir + "/model", &buffer_prog); + ReadBinaryFile(model_dir + "/params", &buffer_param); + config.SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], + buffer_param.size()); + config.SetOptimCacheDir(opt_cache_dir); + + config.SwitchUseFeedFetchOps(false); + // Set the input's min, max, opt shape + config.EnableTensorRtEngine( + 1 << 30, 1, 1, AnalysisConfig::Precision::kFloat32, false, false); + if (delete_conv_bn) { + config.pass_builder()->DeletePass("conv_bn_fuse_pass"); + } + if (with_dynamic) { + std::map> min_input_shape = { + {"image", {1, 1, 3, 3}}}; + std::map> max_input_shape = { + {"image", {1, 1, 10, 10}}}; + std::map> opt_input_shape = { + {"image", {1, 1, 3, 3}}}; + + config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + opt_input_shape); + } + auto predictor = CreatePaddlePredictor(config); + auto input_names = predictor->GetInputNames(); + int channels = 1; + int height = 3; + int width = 3; + int input_num = channels * height * width * 1; + + float *input = new float[input_num]; + memset(input, 0, input_num * sizeof(float)); + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape({1, channels, height, width}); + input_t->copy_from_cpu(input); + + ASSERT_TRUE(predictor->ZeroCopyRun()); + + std::vector out_data; + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputTensor(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data.resize(out_num); + output_t->copy_to_cpu(out_data.data()); + + auto predictor2 = predictor->Clone(); + auto input_t2 = predictor2->GetInputTensor(input_names[0]); + input_t2->Reshape({1, channels, height, width}); + input_t2->copy_from_cpu(input); + + ASSERT_TRUE(predictor2->ZeroCopyRun()); + + std::vector 
out_data2; + auto output_t2 = predictor2->GetOutputTensor(output_names[0]); + std::vector output_shape2 = output_t2->shape(); + int out_num2 = std::accumulate(output_shape2.begin(), output_shape2.end(), 1, + std::multiplies()); + out_data2.resize(out_num2); + output_t2->copy_to_cpu(out_data2.data()); + ASSERT_TRUE(out_data2.size() == out_data.size()); + for (size_t i = 0; i < out_data.size(); i++) { + EXPECT_NEAR(out_data2[i], out_data[i], 1e-5); + } +} + TEST(AnalysisPredictor, trt_dynamic) { TestDynamic(true); } TEST(AnalysisPredictor, trt_static) { TestDynamic(false); } TEST(AnalysisPredictor, trt_memory_serialize) { @@ -218,6 +299,7 @@ TEST(AnalysisPredictor, trt_memory_serialize) { TEST(AnalysisPredictor, trt_dynamic2) { TestDynamic2(); } TEST(AnalysisPredictor, trt_tuned_dynamic) { TestTunedDynamic(); } +TEST(AnalysisPredictor, trt_dynamic_clone) { TestDynamicClone(); } } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index fcc76538b9b03..9d83f8ff8fdc4 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -53,8 +53,10 @@ if (WIN32) if (WITH_MKL) set(FLAG_OPENMP "/openmp") endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4244 /wd4530") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4530") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}") safe_set_static_flag() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index a53c6a8dbeb12..9bc2f5461f383 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -26,6 +26,7 @@ #include "paddle/fluid/platform/place.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" @@ -151,11 +152,12 @@ class AllocatorFacadePrivate { } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_use_stream_safe_cuda_allocator) { - LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " - "naive_best_fit strategy"; - FLAGS_use_stream_safe_cuda_allocator = false; - } + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, false, + paddle::platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is only implemented for auto_growth " + "strategy, not support naive_best_fit strategy")); + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -185,9 +187,6 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; if (FLAGS_use_stream_safe_cuda_allocator) { - default_streams_ = - std::vector(platform::GetGPUDeviceCount(), nullptr); - // TODO(Ruibiao): Support multi-stream allocator for other strategies for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr); @@ -232,11 +231,11 @@ class 
AllocatorFacadePrivate { } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_use_stream_safe_cuda_allocator) { - LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " - "thread_local strategy"; - FLAGS_use_stream_safe_cuda_allocator = false; - } + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, false, + paddle::platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is only implemented for auto_growth " + "strategy, not support thread_local strategy")); for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); @@ -282,50 +281,45 @@ class AllocatorFacadePrivate { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + bool HasCUDAAllocator(const platform::CUDAPlace& place, + const gpuStream_t& stream) { + auto it = cuda_allocators_.find(place); + if (it == cuda_allocators_.end()) { + return false; + } + const std::map>& allocator_map = + it->second; + return allocator_map.find(stream) != allocator_map.end(); + } + const std::shared_ptr& GetAllocator( const platform::CUDAPlace& place, const gpuStream_t& stream, bool create_if_not_found = false) { - auto place_it = cuda_allocators_.find(place); - PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(), - platform::errors::NotFound( - "No allocator found for the place %s", place)); - - const std::map>& allocator_map = - place_it->second; - auto stream_it = allocator_map.find(stream); - if (stream_it == allocator_map.end()) { - if (create_if_not_found) { - InitStreamSafeCUDAAllocator(place, stream); + { // shared_lock_guard + std::shared_lock lock_guard( + cuda_allocator_mutex_); + if (LIKELY(HasCUDAAllocator(place, stream))) { return cuda_allocators_[place][stream]; } else { - PADDLE_THROW(platform::errors::NotFound( - "No allocator found for stream %s in place %s", stream, place)); + PADDLE_ENFORCE_NE(create_if_not_found, false, + platform::errors::NotFound( + "No allocator found for stream %s in place %s " + "with create_if_not_found = false", + stream, place)); } } - return stream_it->second; - } - const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) { - int dev_id = place.GetDeviceId(); - gpuStream_t& default_stream = default_streams_[dev_id]; - if (UNLIKELY(default_stream == nullptr)) { - /* NOTE(Ruibiao): Here if we set default_stream by code " default_stream = - * platform::stream::get_current_stream(place.GetDeviceId())->raw_stream() - * ", then it will be fail to make target 'jit_kernel_benchmark', says a - * undefined reference to `paddle::platform::DeviceContextPool::Get( - * paddle::platform::Place const&)' in function - * `paddle::platform::stream::get_current_stream(int)'. However, target - * allocator_facade will not be affected. It seems a circular dependency - * problem between 'cuda_stream' and 'device_context' that causes this - * strange bug. 
- */ - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - default_stream = - static_cast(pool.Get(place))->stream(); - InitStreamSafeCUDAAllocator(place, default_stream); + { // unique_lock_guard + std::unique_lock lock_guard( + cuda_allocator_mutex_); + InitStreamSafeCUDAAllocator(place, stream); + return cuda_allocators_[place][stream]; } - return default_stream; + } + + gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + return static_cast(pool.Get(place))->stream(); } void RecordStream(std::shared_ptr allocation, @@ -443,18 +437,12 @@ class AllocatorFacadePrivate { "Only support auto-growth strategey for StreamSafeCUDAAllocator, " "the allocator strategy %d is unsupported for multi-stream", static_cast(strategy_))); - VLOG(9) << "Init CUDA allocator for stream " << stream << " in place " << p; - std::lock_guard lock_guard(cuda_allocators_lock_); - try { - GetAllocator(p, stream); - VLOG(9) << "Other thread had build a allocator for stream " << stream - << " in place " << p; - } catch (platform::EnforceNotMet&) { + if (LIKELY(!HasCUDAAllocator(p, stream))) { + VLOG(8) << "Init CUDA allocator for stream " << stream << " in place " + << p; InitAutoGrowthCUDAAllocator(p, stream); WrapStreamSafeCUDAAllocator(p, stream); WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); - } catch (...) { - throw; } } @@ -618,7 +606,7 @@ class AllocatorFacadePrivate { void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { const std::shared_ptr& underlying_allocator = - GetAllocator(p, stream); + cuda_allocators_[p][stream]; cuda_allocators_[p][stream] = std::make_shared( underlying_allocator, p, stream); } @@ -629,7 +617,7 @@ class AllocatorFacadePrivate { retry_time, 0, platform::errors::InvalidArgument( "Retry time should be larger than 0, but got %d", retry_time)); - std::shared_ptr allocator = GetAllocator(p, stream); + std::shared_ptr allocator = cuda_allocators_[p][stream]; allocator = std::make_shared(allocator, retry_time); } @@ -702,7 +690,7 @@ class AllocatorFacadePrivate { #ifdef PADDLE_WITH_MLU int device_count = platform::GetMLUDeviceCount(); for (int i = 0; i < device_count; ++i) { - platform::XPUPlace p(i); + platform::MLUPlace p(i); system_allocators_[p] = std::make_shared(p); } #endif @@ -784,8 +772,7 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; - std::vector default_streams_; - SpinLock cuda_allocators_lock_; + std::shared_timed_mutex cuda_allocator_mutex_; #ifdef PADDLE_WITH_CUDA std::unordered_map> cuda_graph_allocator_map_; @@ -879,9 +866,9 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ->Release(place); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocatorFacade::AllocShared( - const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) { + const platform::Place& place, size_t size, const platform::Stream& stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( @@ -896,12 +883,41 @@ std::shared_ptr AllocatorFacade::AllocShared( "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); } #endif + gpuStream_t s = reinterpret_cast(stream.id()); + return std::shared_ptr(Alloc(place, size, 
s)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); +#endif +} + +bool AllocatorFacade::InSameStream( + const std::shared_ptr& allocation, + const platform::Stream& stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'InSameStream' function. To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); - return std::shared_ptr(Alloc(place, size, stream)); +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + gpuStream_t s = reinterpret_cast(stream.id()); + return s == GetStream(allocation); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); +#endif } -AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, - size_t size, const gpuStream_t& stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, + const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( @@ -917,11 +933,12 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, } #endif + platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place); if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { - return m_->GetAllocator(place, stream, /* create_if_not_found = */ true) + return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) ->Allocate(size); } else { - return m_->GetAllocator(place, size)->Allocate(size); + return m_->GetAllocator(p, size)->Allocate(size); } } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 4c4f805a0c619..d59ecaece5a70 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/stream.h" namespace paddle { namespace memory { @@ -57,21 +58,27 @@ class AllocatorFacade { // Release unused memory pool. uint64_t Release(const platform::Place& place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::shared_ptr AllocShared(const platform::CUDAPlace& place, + std::shared_ptr AllocShared(const platform::Place& place, size_t size, - const gpuStream_t& stream); - AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const platform::Stream& stream); + + bool InSameStream(const std::shared_ptr& allocation, + const platform::Stream& stream); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(zhiqiu): change gpuStream_t to platform::Stream if needed. 
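A note on the interface introduced above: AllocShared now accepts any platform::Place together with a platform::Stream value rather than the CUDA-only CUDAPlace/gpuStream_t pair, and the new InSameStream reports whether the given stream is the one the allocation is bound to (it simply compares against GetStream(allocation)). A rough usage sketch of the new surface follows; it is illustrative only, assumes a CUDA build with FLAGS_use_stream_safe_cuda_allocator enabled, assumes platform::StreamId is the integer alias that platform::Stream is constructed from (the cast target is not spelled out in this patch), and the demo function itself is invented.

// Illustrative sketch, not part of this patch: exercising the new
// stream-aware allocation interface declared in memory/malloc.h.
#include <memory>

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/stream/stream.h"

namespace paddle {
namespace memory {

void StreamAwareAllocDemo(const platform::CUDAPlace& place,
                          gpuStream_t work_stream) {
  // Wrap the raw handle the same way the updated tests do; StreamId is
  // assumed to be the integer id type from platform/stream/stream.h.
  platform::Stream stream(
      reinterpret_cast<platform::StreamId>(work_stream));

  // Place-agnostic, stream-bound shared allocation.
  std::shared_ptr<Allocation> buf =
      AllocShared(place, /*size=*/1 << 20, stream);

  // True when `stream` is the stream the allocation was made on, so a caller
  // can tell whether any cross-stream bookkeeping is needed at all.
  bool same = InSameStream(buf, stream);
  (void)same;

  // The raw-stream overload with unique ownership is still available.
  AllocationPtr scratch = Alloc(place, /*size=*/4096, work_stream);
  (void)scratch;
}

}  // namespace memory
}  // namespace paddle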
+ AllocationPtr Alloc(const platform::Place& place, size_t size, const gpuStream_t& stream); uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream); const gpuStream_t& GetStream( const std::shared_ptr& allocation) const; +#endif + #ifdef PADDLE_WITH_CUDA void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); -#endif #endif // TODO(yy): Allocate a Copy-On-Write allocation? diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/allocation/base_ptr_test.cu index 1b284c9899dbb..a34750a5e34ba 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/allocation/base_ptr_test.cu @@ -35,7 +35,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { void OneByOneAllocTest() { for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - std::shared_ptr allocation = AllocShared(place_, size); + AllocationPtr allocation = Alloc(place_, size); void* base_ptr = allocation->base_ptr(); void* system_ptr = @@ -47,21 +47,21 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void BatchByBatchAllocTest() { - std::vector> allocations; + std::vector allocations; allocations.reserve(batch_size_); size_t batch_num = alloc_times_ / batch_size_; for (size_t i = 0; i < batch_num; ++i) { for (size_t j = 0; j < batch_size_; ++j) { size_t size = dis_(random_engine_); - std::shared_ptr allocation = AllocShared(place_, size); + AllocationPtr allocation = Alloc(place_, size); void* base_ptr = allocation->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(allocation); + allocations.emplace_back(std::move(allocation)); } allocations.clear(); } @@ -70,19 +70,19 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ContinuousAllocTest() { - std::vector> allocations; + std::vector allocations; allocations.reserve(alloc_times_); for (size_t i = 0; i < alloc_times_; ++i) { size_t size = dis_(random_engine_); - std::shared_ptr allocation = AllocShared(place_, size); + AllocationPtr allocation = Alloc(place_, size); void* base_ptr = allocation->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); - allocations.emplace_back(allocation); + allocations.emplace_back(std::move(allocation)); } allocations.clear(); @@ -90,7 +90,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { } void ZeroSizeAllocTest() { - std::shared_ptr allocation = AllocShared(place_, 0); + AllocationPtr allocation = Alloc(place_, 0); void* base_ptr = allocation->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index d2319dacdd33f..8710bbe6ce98b 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -29,15 +29,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif -#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/device/xpu/xpu_header.h" -#endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/platform/device/npu/npu_info.h" -#endif -#ifdef PADDLE_WITH_MLU -#include 
"paddle/fluid/platform/device/mlu/mlu_info.h" -#endif +#include "paddle/fluid/platform/device/device_wrapper.h" PADDLE_DEFINE_EXPORTED_bool( init_allocated_mem, false, @@ -153,24 +145,9 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void *p = nullptr; - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - ret = xpu_set_device(place.device); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - ret = xpu_malloc(reinterpret_cast(&p), size); + + platform::XPUDeviceGuard gurad(place.device); + int ret = xpu_malloc(reinterpret_cast(&p), size); if (ret != XPU_SUCCESS) { std::cout << "xpu memory malloc(" << size << ") failed, try again\n"; xpu_wait(); @@ -184,12 +161,6 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { PADDLE_THROW(platform::errors::Unimplemented( "xpu memory FLAGS_init_allocated_mem is not implemented.")); } - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); VLOG(10) << " pointer=" << p; return p; #else @@ -205,30 +176,9 @@ void Free(const platform::XPUPlace &place, void *p, #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - ret = xpu_set_device(place.device); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + + platform::XPUDeviceGuard gurad(place.device); xpu_free(p); - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); #else PADDLE_THROW( platform::errors::PermissionDenied("'XPUPlace' is not supported.")); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 0d0318859c626..a4f766f1d1abc 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -118,6 +118,7 @@ bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { ProcessUnfreedAllocations(); + VLOG(8) << "Try allocate " << size << " bytes"; AllocationPtr underlying_allocation; try { underlying_allocation = underlying_allocator_->Allocate(size); @@ -150,10 +151,12 @@ void 
StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { "StreamSafeCUDAAllocation*", allocation)); VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr(); + std::lock_guard lock_guard(unfreed_allocation_lock_); if (stream_safe_cuda_allocation->CanBeFreed()) { + VLOG(9) << "Directly delete allocation"; delete stream_safe_cuda_allocation; } else { - std::lock_guard lock_guard(unfreed_allocation_lock_); + VLOG(9) << "Put into unfreed_allocation list"; unfreed_allocations_.emplace_back(stream_safe_cuda_allocation); } } diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index e9b715c5cc3cf..96fcd6254d885 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -13,18 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/memory/detail/buddy_allocator.h" + #include #include "gflags/gflags.h" #include "glog/logging.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MLU) -DECLARE_uint64(reallocate_gpu_memory_in_mb); -#endif -#ifdef PADDLE_WITH_ASCEND_CL + defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL) +#define USE_DEVICE DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif + #ifdef PADDLE_WITH_MLU #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif @@ -180,33 +180,24 @@ uint64_t BuddyAllocator::Release() { std::lock_guard lock(mutex_); int num = 0; uint64_t bytes = 0; - bool del_flag = false; for (auto iter = pool_.begin(); iter != pool_.end();) { auto remain_size = std::get<1>(*iter); auto remain_ptr = std::get<2>(*iter); - for (auto& chunk : chunks_) { - auto init_size = std::get<1>(chunk); - auto init_ptr = std::get<2>(chunk); - - if (init_size == remain_size && init_ptr == remain_ptr) { - ++num; - bytes += init_size; - total_free_ -= init_size; - auto block = static_cast(std::get<2>(chunk)); - system_allocator_->Free(init_ptr, init_size, std::get<0>(chunk)); - cache_.Invalidate(block); - del_flag = true; - break; - } - } - - if (del_flag) { + auto found = chunks_.find({remain_size, remain_ptr}); + if (found != chunks_.end()) { + size_t index = found->second; + ++num; + bytes += remain_size; + total_free_ -= remain_size; + auto block = static_cast(remain_ptr); + system_allocator_->Free(remain_ptr, remain_size, index); + cache_.Invalidate(block); iter = pool_.erase(iter); } else { iter++; } } - VLOG(10) << "Release " << num << " chunk, Free " << bytes << " bytes."; + VLOG(10) << "Release " << num << " chunks, Free " << bytes << " bytes."; return bytes; } @@ -234,49 +225,15 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( size_t index = 0; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (system_allocator_->UseGpu()) { - if ((total_used_ + total_free_) == 0) { - // Compute the allocation size for gpu for the first allocation. - allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes); - } else { - // Compute the re-allocation size, we store the re-allocation size when - // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. 
- if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { - realloc_size_ = platform::GpuReallocSize(); - } - allocate_bytes = std::max(realloc_size_, request_bytes); - } - } -#endif -#ifdef PADDLE_WITH_ASCEND_CL - if (system_allocator_->UseGpu()) { - if ((total_used_ + total_free_) == 0) { - // Compute the allocation size for gpu for the first allocation. - allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes); - } else { - // Compute the re-allocation size, we store the re-allocation size when - // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. - if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { - realloc_size_ = platform::NPUReallocSize(); - } - allocate_bytes = std::max(realloc_size_, request_bytes); - } - } -#endif -#ifdef PADDLE_WITH_MLU - if (system_allocator_->UseGpu()) { - if ((total_used_ + total_free_) == 0) { - // Compute the allocation size for mlu for the first allocation. - allocate_bytes = std::max(platform::MLUInitAllocSize(), request_bytes); - } else { - // Compute the re-allocation size, we store the re-allocation size when - // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. - if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { - realloc_size_ = platform::MLUReallocSize(); - } - allocate_bytes = std::max(realloc_size_, request_bytes); - } - } + allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize, + &platform::GpuReallocSize, request_bytes); +#elif defined(PADDLE_WITH_ASCEND_CL) + allocate_bytes = DeviceAllocateSize(&platform::NPUInitAllocSize, + &platform::NPUReallocSize, request_bytes); +#elif defined(PADDLE_WITH_MLU) + allocate_bytes = + DeviceAllocateSize(&platform::MLUInitAllocSize(), + &platform::MLUReallocSize(), request_bytes); #endif // Allocate a new block @@ -293,7 +250,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( total_free_ += allocate_bytes; // record the chunk. - chunks_.insert(IndexSizeAddress(index, allocate_bytes, p)); + chunks_.insert({{allocate_bytes, p}, index}); // dump the block into pool return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first; @@ -350,6 +307,31 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, return block; } +size_t BuddyAllocator::DeviceAllocateSize( + std::function init_allocate_size_func, + std::function re_allocate_size_func, size_t request_bytes) { + size_t allocate_bytes = max_chunk_size_; +#if defined(USE_DEVICE) + const bool use_gpu = system_allocator_->UseGpu(); + VLOG(10) << "use_gpu " << use_gpu << ", total_used " << total_used_ + << ", total_free " << total_free_; + if (use_gpu) { + if (total_used_ == 0 && total_free_ == 0) { + // Compute the allocation size for gpu for the first allocation. + allocate_bytes = std::max(init_allocate_size_func(), request_bytes); + } else { + // Compute the re-allocation size, we store the re-allocation size when + // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. 
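The Release() rewrite above swaps the nested scan over chunks_ for a keyed lookup, and the header hunk further down retypes chunks_ as a PoolMap keyed by {size, ptr} with the system-allocator index as the mapped value. A minimal standalone sketch of that bookkeeping is below; the typedefs mirror the ones in buddy_allocator.h, but the program itself is a toy and not Paddle code.

// Standalone sketch of the new chunk bookkeeping (toy program, not Paddle code).
#include <cstddef>
#include <cstdio>
#include <map>
#include <set>
#include <tuple>
#include <utility>

using IndexSizeAddress = std::tuple<std::size_t, std::size_t, void*>;  // {index, size, ptr}
using PoolSet = std::set<IndexSizeAddress>;                            // free blocks
using PoolMap = std::map<std::pair<std::size_t, void*>, std::size_t>;  // {size, ptr} -> index

int main() {
  PoolSet pool;
  PoolMap chunks;

  // RefillPool(): record the whole chunk under its {size, ptr} key.
  static char backing[4096];
  pool.insert({0, sizeof(backing), backing});
  chunks.insert({{sizeof(backing), backing}, 0});

  // Release(): a free block goes back to the system only if it is a whole
  // chunk; that test is now an O(log n) map lookup instead of a linear scan.
  std::size_t freed_bytes = 0;
  for (auto it = pool.begin(); it != pool.end();) {
    std::size_t size = std::get<1>(*it);
    void* ptr = std::get<2>(*it);
    auto found = chunks.find({size, ptr});
    if (found != chunks.end()) {
      freed_bytes += size;  // the real code also frees via system_allocator_
      it = pool.erase(it);  // and invalidates the cached metadata block
    } else {
      ++it;
    }
  }
  std::printf("released %zu bytes\n", freed_bytes);
  return 0;
}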
+ if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { + realloc_size_ = re_allocate_size_func(); + } + allocate_bytes = std::max(realloc_size_, request_bytes); + } + } +#endif + return allocate_bytes; +} + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index b7be895b35830..0d736f680503a 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -15,11 +15,14 @@ limitations under the License. */ #pragma once #include + +#include +#include #include #include // NOLINT #include #include -#include +#include #include #include "paddle/fluid/memory/detail/memory_block.h" @@ -59,6 +62,9 @@ class BuddyAllocator { using IndexSizeAddress = std::tuple; // Each element in PoolSet is a free allocation using PoolSet = std::set; + // Each element in PoolMap is an allocation record + // key: , value: index + using PoolMap = std::map, size_t>; /*! \brief Allocate fixed-size memory from system */ void* SystemAlloc(size_t size); @@ -80,6 +86,11 @@ class BuddyAllocator { /*! \brief Find the existing chunk which used to allocation */ PoolSet::iterator FindExistChunk(size_t size); + /*! \brief Allocate bytes from the device */ + size_t DeviceAllocateSize(std::function init_allocate_size_func, + std::function re_allocate_size_func, + size_t request_bytes); + private: size_t total_used_ = 0; // the total size of used memory size_t total_free_ = 0; // the total size of free memory @@ -102,7 +113,7 @@ class BuddyAllocator { /** * \brief Record the allocated chunks when Refill pool. */ - PoolSet chunks_; + PoolMap chunks_; private: /*! Unify the metadata format between GPU and CPU allocations */ diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 7d19115940fee..a6b7f497bafca 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -95,10 +95,10 @@ TEST(BuddyAllocator, GpuFraction) { // Less than pool size TestBuddyAllocator(&buddy_allocator, 10); TestBuddyAllocator(&buddy_allocator, 10 << 10); - TestBuddyAllocator(&buddy_allocator, 10 << 20); + TestBuddyAllocator(&buddy_allocator, 1 << 20); // Greater than max chunk size - TestBuddyAllocator(&buddy_allocator, 300 << 20, + TestBuddyAllocator(&buddy_allocator, 500 << 20, /* use_system_allocator = */ true); TestBuddyAllocator(&buddy_allocator, 1 * static_cast(1 << 30), /* use_system_allocator = */ true); @@ -189,6 +189,35 @@ TEST(BuddyAllocator, FractionRefillPool) { buddy_allocator.Free(p1); } +TEST(BuddyAllocator, DeviceRefillPool) { + const size_t malloc_size = 10; + const size_t malloc_bytes = malloc_size << 20; + FLAGS_initial_gpu_memory_in_mb = malloc_size; + FLAGS_reallocate_gpu_memory_in_mb = malloc_size; + + EXPECT_EQ(platform::GpuMaxChunkSize(), malloc_bytes); + + size_t max_chunk_size = platform::GpuMaxChunkSize(); + BuddyAllocator buddy_allocator( + std::unique_ptr(new GPUAllocator(TEST_GPU_ID)), + platform::GpuMinChunkSize(), max_chunk_size); + + int* p0 = TestBuddyAllocator(&buddy_allocator, malloc_bytes - 1000, + /* use_system_allocator = */ false, + /* free_ptr = */ false); + // Max chunk size should be same during allocation + EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize()); + + int* p1 = TestBuddyAllocator(&buddy_allocator, malloc_bytes - 1000, + /* use_system_allocator = */ false, + /* free_ptr = */ false); + // 
Max chunk size should be same during allocation + EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize()); + + buddy_allocator.Free(p0); + buddy_allocator.Free(p1); +} + TEST(BuddyAllocator, AllocFromAvailable) { FLAGS_fraction_of_gpu_memory_to_use = 0.7; FLAGS_initial_gpu_memory_in_mb = 0; @@ -350,7 +379,6 @@ TEST(BuddyAllocator, Release) { #ifdef PADDLE_WITH_ASCEND_CL TEST(BuddyAllocator, NpuFraction) { // In a 16 GB machine, the pool size will be about 160 MB - FLAGS_fraction_of_gpu_memory_to_use = 0.005; FLAGS_fraction_of_gpu_memory_to_use = 0.92; FLAGS_initial_gpu_memory_in_mb = 0; FLAGS_reallocate_gpu_memory_in_mb = 0; diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index d818459fb03a0..dbf3fad6c3373 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -19,12 +19,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#endif -#ifdef PADDLE_WITH_MLU -#include "paddle/fluid/platform/device/mlu/enforce.h" -#endif +#include "paddle/fluid/platform/device/device_wrapper.h" DECLARE_bool(use_pinned_memory); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 5ec96c39bb604..3e859377e98d8 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/stream.h" namespace paddle { namespace memory { @@ -33,14 +34,20 @@ uint64_t Release(const platform::Place& place) { return allocation::AllocatorFacade::Instance().Release(place); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -std::shared_ptr AllocShared(const platform::CUDAPlace& place, +std::shared_ptr AllocShared(const platform::Place& place, size_t size, - const gpuStream_t& stream) { + const platform::Stream& stream) { return allocation::AllocatorFacade::Instance().AllocShared(place, size, stream); } +bool InSameStream(const std::shared_ptr& allocation, + const platform::Stream& stream) { + return allocation::AllocatorFacade::Instance().InSameStream(allocation, + stream); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 7ca15c5dfc127..7069fb46203d6 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/stream.h" namespace paddle { @@ -40,11 +41,14 @@ extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); extern uint64_t Release(const platform::Place& place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -extern std::shared_ptr AllocShared(const platform::CUDAPlace& place, +extern std::shared_ptr AllocShared(const platform::Place& place, size_t size, - const gpuStream_t& stream); + const platform::Stream& stream); +extern bool InSameStream(const std::shared_ptr& allocation, + const platform::Stream& stream); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 2814f2f9501a8..4a10922adbf75 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -14,18 +14,10 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" -#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/device/xpu/xpu_header.h" -#endif - -#ifdef PADDLE_WITH_MLU -#include "paddle/fluid/platform/device/mlu/mlu_info.h" -#endif - namespace paddle { namespace memory { @@ -74,41 +66,7 @@ void Copy(platform::XPUPlace dst_place, VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; } - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - if (dev_id != dst_place.device) { - ret = xpu_set_device(dst_place.device); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - } - ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id != dst_place.device) { - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - } + platform::MemcpySyncH2D(dst, src, num, dst_place.device); } template <> @@ -120,46 +78,7 @@ void Copy(platform::CPUPlace dst_place, VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; } - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - if (dev_id != src_place.device) { - ret = xpu_set_device(src_place.device); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU 
API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - } - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.GetByPlace(src_place); - dev_ctx->Wait(); - - ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id != src_place.device) { - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - } + platform::MemcpySyncD2H(dst, src, num, src_place.device); } template <> @@ -171,69 +90,7 @@ void Copy(platform::XPUPlace dst_place, VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; } - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - if (dev_id != src_place.device || dev_id != dst_place.device) { - ret = xpu_set_device(src_place.device); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - void* tmp = malloc(num); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.GetByPlace(src_place); - dev_ctx->Wait(); - - ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - ret = xpu_set_device(dst_place.device); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - free(tmp); - } else { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.GetByPlace(src_place); - int ret = xpu::copy(dev_ctx->x_context(), static_cast(src), - static_cast(dst), num); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External( - "XPU API return wrong value[%d %s]", - ret, XPUAPIErrorMsg[ret])); - } + platform::MemcpySyncD2D(dst, dst_place.device, src, src_place.device, num); } #endif diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 52c3825053ca2..bb44b29ac5b01 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -30,6 +30,7 @@ #include 
"paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/stream/stream.h" namespace paddle { namespace memory { @@ -69,8 +70,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); #endif - std::shared_ptr allocation = - AllocShared(place_, workspace_size_, stream); + std::shared_ptr allocation = AllocShared( + place_, workspace_size_, + platform::Stream(reinterpret_cast(stream))); #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaMemset(allocation->ptr(), 0, allocation->size())); @@ -83,7 +85,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { workspaces_.emplace_back(allocation); } - result_ = AllocShared(place_, stream_num_ * workspace_size_); + result_ = Alloc(place_, stream_num_ * workspace_size_); } void SingleStreamRun(size_t idx) { @@ -183,7 +185,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { platform::CUDAPlace place_; std::vector streams_; std::vector> workspaces_; - std::shared_ptr result_; + allocation::AllocationPtr result_; }; TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { @@ -223,22 +225,23 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) { platform::CUDAPlace place = platform::CUDAPlace(); + size_t alloc_size = 256; + + allocation::AllocationPtr allocation_implicit_stream = + Alloc(place, alloc_size); + EXPECT_GE(allocation_implicit_stream->size(), alloc_size); + void *address = allocation_implicit_stream->ptr(); + allocation_implicit_stream.reset(); + auto &instance = allocation::AllocatorFacade::Instance(); const std::shared_ptr &allocator = instance.GetAllocator(place); - size_t alloc_size = 256; - std::shared_ptr allocation_from_allocator = + allocation::AllocationPtr allocation_from_allocator = allocator->Allocate(alloc_size); EXPECT_GE(allocation_from_allocator->size(), alloc_size); - void *address = allocation_from_allocator->ptr(); + EXPECT_EQ(allocation_from_allocator->ptr(), address); allocation_from_allocator.reset(); - std::shared_ptr allocation_implicit_stream = - AllocShared(place, alloc_size); - EXPECT_GE(allocation_implicit_stream->size(), alloc_size); - EXPECT_EQ(allocation_implicit_stream->ptr(), address); - allocation_implicit_stream.reset(); - Release(place); CheckMemLeak(place); } @@ -283,8 +286,9 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&new_stream)); #endif - std::shared_ptr allocation_new_stream = - AllocShared(place, alloc_size, new_stream); + std::shared_ptr allocation_new_stream = AllocShared( + place, alloc_size, + platform::Stream(reinterpret_cast(new_stream))); EXPECT_EQ(GetStream(allocation_new_stream), new_stream); #ifdef PADDLE_WITH_CUDA @@ -311,7 +315,9 @@ TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) { EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet); EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place), paddle::platform::EnforceNotMet); - EXPECT_THROW(AllocShared(place, alloc_size, nullptr), + EXPECT_THROW(AllocShared(place, alloc_size, + platform::Stream( + reinterpret_cast(nullptr))), paddle::platform::EnforceNotMet); EXPECT_THROW(Alloc(place, alloc_size, nullptr), paddle::platform::EnforceNotMet); @@ -342,13 +348,12 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { // so the second alloc will fail and retry 
size_t alloc_size = available_size / 4 * 3; - std::shared_ptr allocation1 = - AllocShared(place, alloc_size, stream1); - std::shared_ptr allocation2; + allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1); + allocation::AllocationPtr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = AllocShared(place, alloc_size, stream2); + allocation2 = Alloc(place, alloc_size, stream2); }); allocation1.reset(); // free but not release th.join(); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 2cb80068b0d6e..01d6c9322883b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -91,7 +91,13 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten pten_api_utils) +if (WITH_ROCM) + hip_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor) +else() + cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor) +endif() + +set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten pten_api_utils gather_scatter_kernel) register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 1d0dda7cd6626..c5ca1fd0e8cab 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -806,6 +806,36 @@ Swish Activation Operator. } }; +class MishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of Mish operator"); + AddOutput("Out", "Output of Mish operator"); + AddAttr( + "threshold", + "Constant threshold of softplus in Mish operator. Approximate value " + "of softplus will be used if absolute value of input is greater than " + ":attr:`threshold`") + .SetDefault(20.f); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); + AddComment(R"DOC( +Mish Activation Operator. + +.. 
math:: + softplus(x) = \begin{cases} + x, \text{if } x > \text{threshold} \\ + \ln(1 + e^{x}), \text{otherwise} + \end{cases} + + out = x * \tanh(softplus(x)) + +)DOC"); + } +}; + class HardSwishOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -1901,4 +1931,11 @@ REGISTER_OP_VERSION(softplus) .NewAttr("threshold", "The threshold value of the new formula", 20.0f)); +REGISTER_OP_VERSION(mish) + .AddCheckpoint( + R"ROC(add new attributes [use_mkldnn], and when computing softplus the formula is changed as the new veriosn of softplus)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_mkldnn", "(bool, default false) Only used in mkldnn kernel", + false)); + /* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 4818043b93be2..342ed3a6b19e2 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -1145,6 +1145,55 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct CudaMishFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // mish(x) = x * tanh(softplus(x)) + // softplus(x) = x, if x > threshold + // = ln(1 + exp(x)), otherwise + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T& arg_x) const { + MPType x = static_cast(arg_x); + MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); + return static_cast(x * tanh(sp)); + } +}; + +template +struct CudaMishGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp))) + // sp = softplus(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType sp = (x > static_cast(threshold)) ? x : log(one + exp(x)); + MPType gsp = + (x > static_cast(threshold)) ? 
one : one / (one + exp(-x)); + MPType tsp = tanh(sp); + return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct CudaThresholdedReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -1808,6 +1857,7 @@ REGISTER_OP_CUDA_KERNEL( __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ + __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ CudaThresholdedReluGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 9ba49e598ed5c..6e32860d69c62 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1412,6 +1412,46 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +// mish(x) = x * tanh(softplus(x)) +// softplus(x) = x, if x > threshold +// = ln(1 + exp(x)), otherwise +template +struct MishFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) { + auto sp = (x > static_cast(threshold)) + .select(x, (static_cast(1) + x.exp()).log()); + out.device(d) = x * sp.tanh(); + } +}; + +// dx = dout * (tanh(sp) + x * (1 - tanh(sp) ** 2) * (1 - exp(-sp))) +// sp = softplus(x) +template +struct MishGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { + auto sp = (x > static_cast(threshold)) + .select(x, (static_cast(1) + x.exp()).log()); + auto gsp = static_cast(1) - (-sp).exp(); + auto tsp = sp.tanh(); + dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // softsign(x) = x / (1 + |x|) template struct SoftsignFunctor : public BaseActivationFunctor { @@ -2841,4 +2881,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ ThresholdedReluGradFunctor); \ + __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 1ad581cf4ca2b..caa498faddaa1 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -27,40 +27,37 @@ namespace operators { using Tensor = framework::Tensor; -template +template class ActivationMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); - auto& dev_ctx = ctx.template device_context(); + float alpha = ctx.HasAttr("alpha") ? 
ctx.Attr("alpha") : 1.0f; output->mutable_data(ctx.GetPlace()); - MLUCnnlActivationDesc act_desc(act_mode, alpha_); + MLUCnnlActivationDesc act_desc(act_mode, alpha); MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->type())); MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->type())); - MLUCnnl::Active(dev_ctx, act_desc.get(), input_desc.get(), + MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(), reinterpret_cast(input->data()), output_desc.get(), reinterpret_cast(output->data())); } - - private: - float alpha_ = 1.0; }; -template +template class ActivationGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - auto& dev_ctx = ctx.template device_context(); + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; dx->mutable_data(ctx.GetPlace()); @@ -70,16 +67,13 @@ class ActivationGradMLUKernel : public framework::OpKernel { ToCnnlDataType(out->type())); MLUCnnlTensorDesc dx_desc(*dx, CNNL_LAYOUT_ARRAY, ToCnnlDataType(dx->type())); - MLUCnnlActivationDesc act_desc(act_mode, alpha_); + MLUCnnlActivationDesc act_desc(act_mode, alpha); MLUCnnl::ActiveGrad( - dev_ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr, + ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr, dout_desc.get(), reinterpret_cast(dout->data()), out_desc.get(), reinterpret_cast(out->data()), dx_desc.get(), reinterpret_cast(dx->data())); } - - private: - float alpha_ = 1.0; }; } // namespace operators @@ -88,13 +82,9 @@ class ActivationGradMLUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_MLU_KERNEL( - relu, ops::ActivationMLUKernel, - ops::ActivationMLUKernel); + relu, ops::ActivationMLUKernel, + ops::ActivationMLUKernel); REGISTER_OP_MLU_KERNEL( - relu_grad, ops::ActivationGradMLUKernel, - ops::ActivationGradMLUKernel, + ops::ActivationGradMLUKernel); diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index fe85eb26705d1..60188ee53ef07 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -98,29 +98,29 @@ void xpu_activation_backward( } template -struct XPUReluFunctor : public BaseActivationFunctor { +struct XPUAbsFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { xpu_activation_forward( - ctx, xpu::relu); + ctx, xpu::abs); } }; template -struct XPUSigmoidFunctor : public BaseActivationFunctor { +struct XPUAbsGradFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { - xpu_activation_forward( - ctx, xpu::sigmoid); + xpu_activation_backward( + ctx, xpu::abs_grad); } }; template -struct XPUTanhFunctor : public BaseActivationFunctor { +struct XPUExpFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { xpu_activation_forward( - ctx, xpu::tanh); + ctx, xpu::exp); } }; @@ -134,119 +134,83 @@ struct XPULogFunctor : public BaseActivationFunctor { }; template -struct XPUSquareFunctor : public BaseActivationFunctor { +struct XPUReciprocalFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; 
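Stepping back to the Mish kernels added earlier in this patch: the Eigen-based MishGradFunctor writes the softplus gradient as 1 - exp(-softplus(x)), while CudaMishGradFunctor uses 1 / (1 + exp(-x)); below the threshold these are the same quantity, because exp(-log(1 + e^x)) = 1 / (1 + e^x). The standalone check below evaluates mish(x) = x * tanh(softplus(x)) and compares that analytic gradient against a central finite difference; it is plain C++ with the default threshold of 20 hard-coded, not Paddle code.

// Standalone numerical check of mish and its gradient as implemented above.
#include <cassert>
#include <cmath>
#include <cstdio>

const float kThreshold = 20.0f;  // matches the operator's default "threshold"

float softplus(float x) {
  return x > kThreshold ? x : std::log(1.0f + std::exp(x));
}

float mish(float x) { return x * std::tanh(softplus(x)); }

// dmish/dx = tanh(sp) + x * (1 - tanh(sp)^2) * sigmoid(x), with sp = softplus(x);
// sigmoid(x) is exactly the 1 - exp(-sp) factor used by the CPU functor.
float mish_grad(float x) {
  float sp = softplus(x);
  float tsp = std::tanh(sp);
  float gsp = x > kThreshold ? 1.0f : 1.0f / (1.0f + std::exp(-x));
  return tsp + x * (1.0f - tsp * tsp) * gsp;
}

int main() {
  const float eps = 1e-3f;
  for (float x : {-3.0f, -0.5f, 0.0f, 1.2f, 4.0f, 25.0f}) {
    float fd = (mish(x + eps) - mish(x - eps)) / (2.0f * eps);
    std::printf("x=%6.2f  mish=%9.5f  grad=%9.5f  fd=%9.5f\n",
                x, mish(x), mish_grad(x), fd);
    assert(std::fabs(mish_grad(x) - fd) < 1e-2f);
  }
  return 0;
}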
void operator()(const framework::ExecutionContext &ctx) const { xpu_activation_forward( - ctx, xpu::square); + ctx, xpu::reciprocal); } }; template -struct XPUSqrtFunctor : public BaseActivationFunctor { +struct XPUReciprocalGradFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { - xpu_activation_forward( - ctx, xpu::sqrt); + xpu_activation_backward( + ctx, xpu::reciprocal_grad); } }; template -struct XPUAbsFunctor : public BaseActivationFunctor { +struct XPUReluFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { xpu_activation_forward( - ctx, xpu::abs); + ctx, xpu::relu); } }; template -struct XPUPowFunctor : public BaseActivationFunctor { +struct XPUReluGradFunctor : public BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Out"); - auto pow_factor = ctx.Attr("factor"); - const T *x_data = x->data(); - T *y_data = y->mutable_data(ctx.GetPlace()); - T *factor_data = nullptr; - - auto xpu_context = - ctx.device_context().x_context(); - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&factor_data), - x->numel() * sizeof(T)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); - int r = xpu::constant(xpu_context, factor_data, x->numel(), pow_factor); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s] in pow op.", - r, XPUAPIErrorMsg[r])); - r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel()); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU pow op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); - if (xpu_context->xpu_stream != nullptr) { - xpu_wait(xpu_context->xpu_stream); - } - xpu_free(factor_data); + xpu_activation_backward( + ctx, xpu::relu_grad); } }; template -struct XPUHardSwishFunctor : public BaseActivationFunctor { +struct XPUSigmoidFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { - float threshold = ctx.Attr("threshold"); - float scale = ctx.Attr("scale"); - float offset = ctx.Attr("offset"); - PADDLE_ENFORCE_EQ(threshold, 6.0f, - platform::errors::External( - "Not support threshold [%f] in XPU", threshold)); - PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External( - "Not support scale [%f] in XPU", scale)); - PADDLE_ENFORCE_EQ( - offset, 3.0f, - platform::errors::External("Not support offset [%f] in XPU", offset)); xpu_activation_forward( - ctx, xpu::hard_swish); + ctx, xpu::sigmoid); } }; template -struct XPUReluGradFunctor : public BaseActivationFunctor { +struct XPUSigmoidGradFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { xpu_activation_backward( - ctx, xpu::relu_grad); + ctx, xpu::sigmoid_grad); } }; template -struct XPUTanhGradFunctor : public BaseActivationFunctor { +struct XPUSqrtFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { - xpu_activation_backward( - ctx, xpu::tanh_grad); + xpu_activation_forward( + ctx, xpu::sqrt); } }; template -struct XPUSigmoidGradFunctor : public 
BaseActivationFunctor { +struct XPUSqrtGradFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { xpu_activation_backward( - ctx, xpu::sigmoid_grad); + ctx, xpu::sqrt_grad); } }; template -struct XPUSqrtGradFunctor : public BaseActivationFunctor { +struct XPUSquareFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { - xpu_activation_backward( - ctx, xpu::sqrt_grad); + xpu_activation_forward( + ctx, xpu::square); } }; @@ -259,6 +223,44 @@ struct XPUSquareGradFunctor : public BaseActivationFunctor { } }; +template +struct XPUTanhFunctor : public BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + void operator()(const framework::ExecutionContext &ctx) const { + xpu_activation_forward( + ctx, xpu::tanh); + } +}; + +template +struct XPUTanhGradFunctor : public BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + void operator()(const framework::ExecutionContext &ctx) const { + xpu_activation_backward( + ctx, xpu::tanh_grad); + } +}; + +template +struct XPUHardSwishFunctor : public BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + void operator()(const framework::ExecutionContext &ctx) const { + float threshold = ctx.Attr("threshold"); + float scale = ctx.Attr("scale"); + float offset = ctx.Attr("offset"); + PADDLE_ENFORCE_EQ(threshold, 6.0f, + platform::errors::External( + "Not support threshold [%f] in XPU", threshold)); + PADDLE_ENFORCE_EQ(scale, 6.0f, platform::errors::External( + "Not support scale [%f] in XPU", scale)); + PADDLE_ENFORCE_EQ( + offset, 3.0f, + platform::errors::External("Not support offset [%f] in XPU", offset)); + xpu_activation_forward( + ctx, xpu::hard_swish); + } +}; + template struct XPUHardSwishGradFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; @@ -328,6 +330,40 @@ struct XPULeakyReluGradFunctor : public BaseActivationFunctor { } }; +template +struct XPUPowFunctor : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Out"); + auto pow_factor = ctx.Attr("factor"); + const T *x_data = x->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); + T *factor_data = nullptr; + + auto xpu_context = + ctx.device_context().x_context(); + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&factor_data), + x->numel() * sizeof(T)), + XPU_SUCCESS, platform::errors::ResourceExhausted( + "XPU has no enough memory")); + int r = xpu::constant(xpu_context, factor_data, x->numel(), pow_factor); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU constant op return" + " wrong value[%d %s] in pow op.", + r, XPUAPIErrorMsg[r])); + r = xpu::pow(xpu_context, x_data, factor_data, y_data, x->numel()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU pow op return wrong value[%d %s].", r, + XPUAPIErrorMsg[r])); + if (xpu_context->xpu_stream != nullptr) { + xpu_wait(xpu_context->xpu_stream); + } + xpu_free(factor_data); + } +}; + } // namespace operators } // namespace paddle @@ -340,15 +376,18 @@ namespace ops = paddle::operators; act_type##_grad, \ ops::XPUActivationGradKernel>); +REGISTER_ACTIVATION_XPU_KERNEL(abs, XPUAbsFunctor, XPUAbsGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor, + XPUHardSwishGradFunctor) 
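The re-added XPUHardSwishFunctor above still only accepts threshold = 6, scale = 6 and offset = 3. Assuming the usual Paddle hard_swish definition out = x * min(max(0, x + offset), threshold) / scale, that parameter set is exactly the canonical hard-swish; a small standalone sketch of the math (illustrative only, not the xpu::hard_swish kernel):

#include <algorithm>
#include <cstdio>

// hard_swish with the only parameter set the XPU functor accepts:
// threshold = 6, scale = 6, offset = 3, i.e. x * relu6(x + 3) / 6.
static float hard_swish(float x) {
  float gate = std::min(std::max(x + 3.0f, 0.0f), 6.0f);  // relu6(x + 3)
  return x * gate / 6.0f;
}

int main() {
  const float xs[] = {-4.0f, -1.0f, 0.0f, 1.0f, 4.0f};
  for (float x : xs) {
    std::printf("hard_swish(%g) = %g\n", x, hard_swish(x));
  }
}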
+REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor, + XPULeakyReluGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, XPUReciprocalFunctor, + XPUReciprocalGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor, XPUSigmoidGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor) -REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor, - XPUHardSwishGradFunctor) -REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, XPULeakyReluFunctor, - XPULeakyReluGradFunctor) REGISTER_OP_XPU_KERNEL( tanh, ops::XPUActivationKernel>, @@ -358,11 +397,11 @@ REGISTER_OP_XPU_KERNEL( ops::XPUActivationGradKernel< ops::XPUTanhGradFunctor>); +REGISTER_OP_XPU_KERNEL(exp, + ops::XPUActivationKernel>); REGISTER_OP_XPU_KERNEL(log, ops::XPUActivationKernel>); REGISTER_OP_XPU_KERNEL(pow, ops::XPUActivationKernel>); -REGISTER_OP_XPU_KERNEL(abs, - ops::XPUActivationKernel>); #endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 021f3a13ce7cf..8160368d72ad1 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -176,7 +176,7 @@ class LazyZerosNPU { NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); runner_zeros.Run(stream); zero_tensor->check_memory_size(); - zero_ptr = zero_tensor->data(); + zero_ptr = zero_tensor->data(); } for (size_t i = 0; i < xs.size(); ++i) { diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc new file mode 100644 index 0000000000000..6fee1e8adccf1 --- /dev/null +++ b/paddle/fluid/operators/argsort_op_xpu.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
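XPUPowFunctor above handles the scalar "factor" attribute by filling a device buffer of the input's size with the factor (xpu::constant) and then running the elementwise xpu::pow, waiting on the stream before xpu_free. A CPU-side sketch of that broadcast-then-elementwise shape in plain C++ (the XPU version additionally needs the xpu_malloc / xpu_wait bookkeeping shown in the diff):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x{1.0f, 2.0f, 3.0f, 4.0f};
  float factor = 2.0f;                               // the "factor" attribute
  std::vector<float> factor_buf(x.size(), factor);   // plays the role of xpu::constant
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = std::pow(x[i], factor_buf[i]);            // elementwise pow, like xpu::pow
  }
  for (float v : y) std::printf("%g ", v);           // 1 4 9 16
  std::printf("\n");
}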
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/argsort_op.h" + +namespace paddle { +namespace operators { + +const int XPU_SORT_MAX_SIZE = 16384; + +template +static inline void xpu_argsort(xpu::Context* ctx, const T* input_data, + T* output_data, TID* indices_data, int m, int n, + bool descending) { + int ret = + xpu::sort(ctx, input_data, output_data, indices_data, m, n, descending); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External("XPU sort kernel return wrong value[%d %s].", + ret, XPUAPIErrorMsg[ret])); +} + +template +static inline void xpu_transpose(xpu::Context* ctx, const T* x, T* y, + const std::vector& xshape, + const std::vector& permute) { + int ret = xpu::transpose(ctx, x, y, xshape, permute); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU transpose kernel return wrong value[%d %s]", ret, + XPUAPIErrorMsg[ret])); +} + +template +static inline void xpu_cast(xpu::Context* ctx, const TX* x, TY* y, int len) { + int ret = xpu::cast_v2(ctx, x, y, len); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External("XPU cast kernel return wrong value[%d %s]", + ret, XPUAPIErrorMsg[ret])); +} + +template +struct XPUArgsort { + void operator()(xpu::Context* ctx, const T* input_data, T* output_data, + int64_t* indices_data, const std::vector& data_shape, + const std::vector& permute, bool descending) { + xpu::ctx_guard RAII_GUARD(ctx); + int m = data_shape[0] * data_shape[2]; + int n = data_shape[1]; + int len = data_shape[0] * data_shape[1] * data_shape[2]; + std::vector trans_data_shape{data_shape[0], data_shape[2], + data_shape[1]}; + + T* input_data_trans = RAII_GUARD.alloc_l3_or_gm(len); + T* output_data_trans = RAII_GUARD.alloc_l3_or_gm(len); + int64_t* indices_data_trans = RAII_GUARD.alloc_l3_or_gm(len); + + xpu_transpose(ctx, input_data, input_data_trans, data_shape, permute); + xpu_argsort(ctx, input_data_trans, output_data_trans, indices_data_trans, m, + n, descending); + xpu_transpose(ctx, output_data_trans, output_data, trans_data_shape, + permute); + xpu_transpose(ctx, indices_data_trans, indices_data, trans_data_shape, + permute); + } +}; + +template +struct XPUArgsort { + void operator()(xpu::Context* ctx, const T* input_data, T* output_data, + int64_t* indices_data, const std::vector& data_shape, + const std::vector& permute, bool descending) { + xpu::ctx_guard RAII_GUARD(ctx); + int m = data_shape[0] * data_shape[2]; + int n = data_shape[1]; + int len = data_shape[0] * data_shape[1] * data_shape[2]; + std::vector trans_data_shape{data_shape[0], data_shape[2], + data_shape[1]}; + + T* input_data_trans = RAII_GUARD.alloc_l3_or_gm(len); + T* output_data_trans = RAII_GUARD.alloc_l3_or_gm(len); + int* indices_data_trans = RAII_GUARD.alloc_l3_or_gm(len); + int64_t* cast_data_int64 = RAII_GUARD.alloc_l3_or_gm(len); + + xpu_transpose(ctx, input_data, input_data_trans, data_shape, permute); + xpu_argsort(ctx, input_data_trans, output_data_trans, indices_data_trans, m, + n, descending); + xpu_transpose(ctx, output_data_trans, output_data, trans_data_shape, + permute); + xpu_cast(ctx, indices_data_trans, cast_data_int64, len); + xpu_transpose(ctx, cast_data_int64, indices_data, trans_data_shape, + permute); + } +}; + +template <> +struct XPUArgsort { + void operator()(xpu::Context* ctx, const int64_t* input_data, + int64_t* output_data, int64_t* indices_data, + const std::vector& data_shape, + const std::vector& permute, bool descending) { + xpu::ctx_guard RAII_GUARD(ctx); + int m = data_shape[0] * 
data_shape[2]; + int n = data_shape[1]; + int len = data_shape[0] * data_shape[1] * data_shape[2]; + std::vector trans_data_shape{data_shape[0], data_shape[2], + data_shape[1]}; + + int* input_data_trans = RAII_GUARD.alloc_l3_or_gm(len); + int* output_data_trans = RAII_GUARD.alloc_l3_or_gm(len); + int* indices_data_trans = RAII_GUARD.alloc_l3_or_gm(len); + int* cast_data_int = RAII_GUARD.alloc_l3_or_gm(len); + int64_t* cast_data_int64 = RAII_GUARD.alloc_l3_or_gm(len); + + xpu_cast(ctx, input_data, cast_data_int, len); + xpu_transpose(ctx, cast_data_int, input_data_trans, data_shape, permute); + xpu_argsort(ctx, input_data_trans, output_data_trans, indices_data_trans, m, + n, descending); + + xpu_cast(ctx, output_data_trans, cast_data_int64, len); + xpu_transpose(ctx, cast_data_int64, output_data, trans_data_shape, permute); + xpu_cast(ctx, indices_data_trans, cast_data_int64, len); + xpu_transpose(ctx, cast_data_int64, indices_data, trans_data_shape, + permute); + } +}; + +template +class ArgsortXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + int axis = ctx.Attr("axis"); + bool descending = ctx.Attr("descending"); + + auto in_dims = input->dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + int n = in_dims[axis]; + + PADDLE_ENFORCE_LT( + n, XPU_SORT_MAX_SIZE, + platform::errors::InvalidArgument( + "The axis dimension of Input should less than %d, but got %d.", + XPU_SORT_MAX_SIZE, in_dims[axis])); + + auto input_data = input->data(); + auto output_data = output->mutable_data(ctx.GetPlace()); + auto indices_data = indices->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = + ctx.template device_context(); + int len_before = + framework::product(framework::slice_ddim(in_dims, 0, axis)); + int len_after = framework::product( + framework::slice_ddim(in_dims, axis + 1, in_dims.size())); + bool int64_need_cast = + (std::is_same::value && n > (XPU_SORT_MAX_SIZE / 2)) + ? true + : false; + bool index_need_cast = (n > (XPU_SORT_MAX_SIZE / 2)) ? true : false; + std::vector permute_vec{0, 2, 1}; + std::vector data_shape{len_before, n, len_after}; + + if (int64_need_cast) { + XPUArgsort()(dev_ctx.x_context(), input_data, output_data, + indices_data, data_shape, permute_vec, + descending); + } else if (index_need_cast) { + XPUArgsort()(dev_ctx.x_context(), input_data, output_data, + indices_data, data_shape, permute_vec, + descending); + } else { + XPUArgsort()(dev_ctx.x_context(), input_data, + output_data, indices_data, data_shape, + permute_vec, descending); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(argsort, ops::ArgsortXPUKernel, + ops::ArgsortXPUKernel, + ops::ArgsortXPUKernel); + +#endif diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index 8499d1cdcd646..d232891f3d684 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -15,6 +15,8 @@ limitations under the License. 
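The XPUArgsort helpers above reduce argsort over an arbitrary axis to a 2-D problem: transpose the target axis to the innermost position, sort each of the m rows of length n (n is checked against XPU_SORT_MAX_SIZE), then transpose values and indices back; when n exceeds half of XPU_SORT_MAX_SIZE the indices (and, for int64 inputs, the data as well) are staged in int32 buffers and cast back to int64. A minimal CPU reference for the per-row step (plain C++, not the xpu::sort kernel):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

// Sort one row of length n, producing sorted values plus original positions,
// mirroring what xpu::sort does for each of the m rows after the transpose.
static void argsort_row(const float* in, float* out, int64_t* idx, int n,
                        bool descending) {
  std::vector<int64_t> order(n);
  std::iota(order.begin(), order.end(), 0);
  std::stable_sort(order.begin(), order.end(), [&](int64_t a, int64_t b) {
    return descending ? in[a] > in[b] : in[a] < in[b];
  });
  for (int i = 0; i < n; ++i) {
    idx[i] = order[i];
    out[i] = in[order[i]];
  }
}

int main() {
  float row[4] = {3.f, 1.f, 4.f, 1.5f};
  float sorted[4];
  int64_t indices[4];
  argsort_row(row, sorted, indices, 4, /*descending=*/false);
  for (int i = 0; i < 4; ++i)
    std::printf("%g (from %lld)\n", sorted[i], (long long)indices[i]);
}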
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/batch_norm_op.h" +#include +#include namespace paddle { namespace operators { @@ -25,23 +27,25 @@ using DDim = framework::DDim; template class BatchNormXPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext &ctx) const override { const auto epsilon = ctx.Attr("epsilon"); - const auto momentum = ctx.Attr("momentum"); + float momentum = ctx.Attr("momentum"); const auto is_test = ctx.Attr("is_test"); const auto use_global_stats = ctx.Attr("use_global_stats"); const auto trainable_stats = ctx.Attr("trainable_statistics"); bool test_mode = is_test && (!trainable_stats); + bool global_stats = test_mode || use_global_stats; - const auto& data_layout_str = ctx.Attr("data_layout"); + const auto &data_layout_str = ctx.Attr("data_layout"); const auto data_layout = framework::StringToDataLayout(data_layout_str); PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, platform::errors::InvalidArgument( "The 'data_layout' attribute must be NCHW. But " "recevived 'data_layout' is [%s].", data_layout_str)); - const auto* x = ctx.Input("X"); - const auto& x_dims = x->dims(); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument( "The input tensor X's dimension must equal to 4. But " @@ -51,27 +55,42 @@ class BatchNormXPUKernel : public framework::OpKernel { const int C = x_dims[1]; const int H = x_dims[2]; const int W = x_dims[3]; - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - const auto* x_data = x->data(); - const auto* scale_data = scale->data(); - const auto* bias_data = bias->data(); - auto* y = ctx.Output("Y"); - auto* y_data = y->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *x_data = x->data(); + const auto *scale_data = scale->data(); + const auto *bias_data = bias->data(); + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + // alloc memory + auto *y_data = y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + if (!global_stats) { - auto* mean_out = ctx.Output("MeanOut"); - auto* variance_out = ctx.Output("VarianceOut"); - auto* saved_mean = ctx.Output("SavedMean"); - auto* saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - auto* mean_out_data = mean_out->data(); - auto* variance_out_data = variance_out->data(); - auto* saved_mean_data = saved_mean->data(); - auto* saved_variance_data = saved_variance->data(); + auto *mean_out_data = mean_out->data(); + auto *variance_out_data = variance_out->data(); + auto *saved_mean_data = saved_mean->data(); + auto *saved_variance_data = saved_variance->data(); + + // if MomentumTensor is set, use MomentumTensor value, momentum + // is only used in this training branch + if 
(ctx.HasInput("MomentumTensor")) { + const auto *mom_tensor = ctx.Input("MomentumTensor"); + Tensor mom_cpu; + TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + momentum = mom_tensor->data()[0]; + } + int r = xpu::batch_norm(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, momentum, scale_data, bias_data, saved_mean_data, saved_variance_data, @@ -81,12 +100,10 @@ class BatchNormXPUKernel : public framework::OpKernel { "The batch_norm XPU API return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); } else { - const auto* mean = ctx.Input("Mean"); - const auto* variance = ctx.Input("Variance"); - const auto* mean_data = mean->data(); - const auto* variance_data = variance->data(); - const auto* x_data = x->data(); - auto* y_data = y->mutable_data(ctx.GetPlace()); + const auto *mean = ctx.Input("Mean"); + const auto *variance = ctx.Input("Variance"); + const auto *mean_data = mean->data(); + const auto *variance_data = variance->data(); int r = xpu::batch_norm_infer(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, scale_data, bias_data, mean_data, variance_data, true); @@ -99,24 +116,96 @@ class BatchNormXPUKernel : public framework::OpKernel { } }; +template +static int calculate_inv_BN_Y(xpu::Context *ctx, T *x, const T *scale, + const T *bias, const T *mean, const T *variance, + const int N, const int C, const int M, + const T *y) { + PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( + "X and Y should be inplaced in inplace mode")); + std::vector tensor_shape_vec({N, C, M}); + std::vector array_shape_vec({1, C, 1}); + // y - bias + int r1 = + xpu::broadcast_sub(ctx, bias, y, x, array_shape_vec, tensor_shape_vec); + // (y - bias) / scale + int r2 = xpu::broadcast_div(ctx, scale, x, x, array_shape_vec, + tensor_shape_vec); + // (y - bias) / scale / variance + int r3 = xpu::broadcast_div(ctx, variance, x, x, array_shape_vec, + tensor_shape_vec); + // (y - bias) / scale / variance + mean + int r4 = + xpu::broadcast_add(ctx, mean, x, x, array_shape_vec, tensor_shape_vec); + + return r1 + r2 + r3 + r4; +} + +template +static int calculate_inv_var(xpu::Context *ctx, const T *var, const T epsilon, + const int C, T *epsilon_data, T *inv_var) { + int r1 = constant(ctx, epsilon_data, 1, epsilon); + std::vector tensor_shape_vec({C}); + std::vector array_shape_vec({1}); + int r2 = xpu::broadcast_add(ctx, epsilon_data, var, inv_var, + array_shape_vec, tensor_shape_vec); + int r3 = xpu::rsqrt(ctx, inv_var, inv_var, C); + return r1 + r2 + r3; +} + template class BatchNormGradXPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* dy = ctx.Input(framework::GradVarName("Y")); - const auto* scale = ctx.Input("Scale"); - const auto* saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto* saved_inv_variance = ctx.Input("SavedVariance"); - const auto& data_layout_str = ctx.Attr("data_layout"); + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + const auto &data_layout_str = ctx.Attr("data_layout"); + bool use_global_stats = ctx.Attr("use_global_stats"); + const bool is_test = ctx.Attr("is_test"); + const float epsilon = ctx.Attr("epsilon"); const auto data_layout = framework::StringToDataLayout(data_layout_str); + + // TODO(guozbin): Transform input 
tensor from NHWC to NCHW PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, platform::errors::InvalidArgument( "The 'data_layout' attribute must be NCHW. But " "recevived 'data_layout' is [%s].", data_layout_str)); - const auto& x_dims = x->dims(); + + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + use_global_stats = is_test || use_global_stats; + + // batch_norm with inplace as false will take X as grad input, which + // is same as cuDNN batch_norm backward calculation, batch_norm + // with inplace as true only take Y as input and X should be calculate + // by inverse operation of batch_norm on Y + const Tensor *x; + bool is_inplace; + if (ctx.HasInput("Y")) { + x = ctx.Input("Y"); + is_inplace = true; + // if the input of batch norm is stop_gradient, d_x is null. + if (d_x) { + PADDLE_ENFORCE_EQ(d_x, d_y, + platform::errors::InvalidArgument( + "X@GRAD and Y@GRAD not inplace in inplace mode")); + } + } else { + x = ctx.Input("X"); + is_inplace = false; + if (d_x) { + PADDLE_ENFORCE_NE( + d_x, d_y, platform::errors::InvalidArgument( + "X@GRAD and Y@GRAD inplaced in non-inplace mode")); + } + } + + const auto &x_dims = x->dims(); PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument( "The input tensor X's dimension must equal to 4. But " @@ -126,26 +215,96 @@ class BatchNormGradXPUKernel : public framework::OpKernel { const int C = x_dims[1]; const int H = x_dims[2]; const int W = x_dims[3]; - const auto* x_data = x->data(); - const auto* dy_data = dy->data(); - const auto* scale_data = scale->data(); - const auto* saved_mean_data = saved_mean->data(); - const auto* saved_inv_variance_data = saved_inv_variance->data(); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dscale = ctx.Output(framework::GradVarName("Scale")); - auto* dbias = ctx.Output(framework::GradVarName("Bias")); - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - auto* dscale_data = dscale->mutable_data(ctx.GetPlace()); - auto* dbias_data = dbias->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - int r = xpu::batch_norm_grad(dev_ctx.x_context(), x_data, dy_data, - dx_data, N, C, H, W, scale_data, - saved_mean_data, saved_inv_variance_data, - dscale_data, dbias_data, true); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + + const auto *x_data = x->data(); + const auto *d_y_data = d_y->data(); + const auto *scale_data = scale->data(); + + // init output + T *d_x_data = nullptr; + T *d_bias_data = nullptr; + T *d_scale_data = nullptr; + if (d_x) { + d_x_data = d_x->mutable_data(ctx.GetPlace()); + } + if (d_scale && d_bias) { + d_scale_data = d_scale->mutable_data(ctx.GetPlace()); + d_bias_data = d_bias->mutable_data(ctx.GetPlace()); + } + + PADDLE_ENFORCE_EQ( + scale->dims().size(), 1UL, + platform::errors::InvalidArgument( + "The size of scale's dimensions must equal to 1. But received: " + "the size of scale's dimensions is [%d], the dimensions of scale " + "is [%s].", + scale->dims().size(), scale->dims())); + PADDLE_ENFORCE_EQ( + scale->dims()[0], C, + platform::errors::InvalidArgument( + "The first dimension of scale must equal to Channels[%d]. 
But " + "received: the first dimension of scale is [%d]", + C, scale->dims()[0])); + + auto &dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + + const T *mean_data = nullptr; + const T *inv_var_data = nullptr; + + // TODO(guozibin): hadle the situation case of N * H * W = 1 + if (!use_global_stats) { + const auto *saved_mean = ctx.Input("SavedMean"); + // SavedVariance have been reverted in forward operator + const auto *saved_inv_variance = ctx.Input("SavedVariance"); + mean_data = saved_mean->data(); + inv_var_data = saved_inv_variance->data(); + } else { + const auto *running_mean = ctx.Input("Mean"); + const auto *running_variance = ctx.Input("Variance"); + mean_data = running_mean->data(); + inv_var_data = running_variance->data(); + float *running_inv_var_data = + RAII_GUARD.alloc_l3_or_gm(running_variance->numel()); + float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); + int r1 = calculate_inv_var(dev_ctx.x_context(), inv_var_data, epsilon, C, + epsilon_data, running_inv_var_data); + PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( + "XPU API(batch_norm_grad " + "calculate_inv_var function) " + "return wrong value[%d %s]", + r1, XPUAPIErrorMsg[r1])); + inv_var_data = running_inv_var_data; + } + if (is_inplace) { + auto px = *x; + int r2 = calculate_inv_BN_Y( + dev_ctx.x_context(), px.mutable_data(ctx.GetPlace()), + scale->data(), bias->data(), mean_data, inv_var_data, N, + C, H * W, x->data()); + PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, platform::errors::External( + "XPU API(batch_norm_grad " + "calculate_inv_BN_Y function) " + "return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); + } + if (!d_x) { + d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); + } + if (!d_scale) { + d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); + } + if (!d_bias_data) { + d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); + } + + int r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, scale_data, + mean_data, inv_var_data, d_scale_data, d_bias_data, true); + PADDLE_ENFORCE_EQ(r3, XPU_SUCCESS, platform::errors::External( + "XPU API(batch_norm_grad) return " + "wrong value[%d %s]", + r3, XPUAPIErrorMsg[r3])); } }; diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 73f73a81c088e..18562b243255b 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include "paddle/fluid/operators/bce_loss_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -23,6 +24,17 @@ namespace operators { using Tensor = framework::Tensor; +template +struct BCELossGradFunctor { + T one = static_cast(1.0f); + T eps = static_cast(1e-12); + __device__ __forceinline__ T operator()(const T& x, const T& label, + const T& dout) const { + T term1 = max((one - x) * x, eps); + return (dout * (x - label) / term1); + } +}; + template __global__ void GPUBCELossForward(const T* x_data, const T* label_data, T* out_data, const int in_numel) { @@ -44,23 +56,6 @@ __global__ void GPUBCELossForward(const T* x_data, const T* label_data, } } -template -__global__ void GPUBCELossBackward(const T* x_data, const T* label_data, - const T* dout_data, T* dx_data, - const int in_numel) { - CUDA_KERNEL_LOOP(i, in_numel) { - T x = x_data[i]; - T label = label_data[i]; - T dout = dout_data[i]; - T one = static_cast(1.); - T eps = static_cast(1e-12); - - T term1 = max((one - x) * x, eps); - - dx_data[i] = dout * (x - label) / term1; - } -} - template class BCELossCUDAKernel : public framework::OpKernel { public: @@ -91,17 +86,13 @@ class BCELossGradCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - - int x_numel = x->numel(); - auto* dx_data = dx->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.cuda_device_context(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); - - GPUBCELossBackward<<>>( - x->data(), labels->data(), dout->data(), dx_data, x_numel); + dx->mutable_data(ctx.GetPlace()); + std::vector ins = {x, labels, dout}; + std::vector outs = {dx}; + auto& dev_ctx = ctx.template device_context(); + auto functor = BCELossGradFunctor(); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } }; diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 7278d80ce9ba1..4853e5324c30f 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -105,6 +105,11 @@ class CastOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(tensor->type(), tensor_place); } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + return framework::KernelSignature("cast", {"X"}, {"out_dtype"}, {"Out"}); + } }; } // namespace operators diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index bf0e81a23bf90..72aa9a195ec7c 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -20,7 +20,7 @@ limitations under the License. 
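The new BCELossGradFunctor above computes the same math as the deleted GPUBCELossBackward kernel, dX = dOut * (X - Label) / max((1 - X) * X, eps), now expressed as a functor for LaunchSameDimsElementwiseCudaKernel. A scalar sketch of the formula in plain C++ (illustrative only):

#include <algorithm>
#include <cstdio>

// dL/dx for binary cross entropy L = -(label*log(x) + (1-label)*log(1-x)),
// with the same eps clamp on x*(1-x) as BCELossGradFunctor.
static float bce_grad(float x, float label, float dout) {
  const float eps = 1e-12f;
  float term1 = std::max((1.0f - x) * x, eps);
  return dout * (x - label) / term1;
}

int main() {
  std::printf("%g\n", bce_grad(0.8f, 1.0f, 1.0f));  // (0.8 - 1) / (0.2 * 0.8) = -1.25
  std::printf("%g\n", bce_grad(0.8f, 0.0f, 1.0f));  // 0.8 / 0.16 = 5
}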
*/ #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/cast_kernel.h" namespace paddle { namespace operators { @@ -59,8 +59,6 @@ class CastOpKernel : public framework::OpKernel { auto* out = context.Output("Out"); auto out_dtype = context.Attr("out_dtype"); - // todo: not used in_dtype - auto in_dtype = context.Attr("in_dtype"); auto& dev_ctx = context.device_context(); out->mutable_data(dev_ctx.GetPlace(), @@ -71,12 +69,9 @@ class CastOpKernel : public framework::OpKernel { auto pt_out_dtype = pten::TransToPtenDataType( static_cast(out_dtype)); - auto pt_in_dtype = pten::TransToPtenDataType( - static_cast(in_dtype)); // call new kernel - pten::Cast(dev_ctx, *pt_x.get(), pt_out_dtype, pt_in_dtype, - pt_out.get()); + pten::CastKernel(dev_ctx, *pt_x.get(), pt_out_dtype, pt_out.get()); } }; diff --git a/paddle/fluid/operators/cholesky_solve_op.cc b/paddle/fluid/operators/cholesky_solve_op.cc new file mode 100644 index 0000000000000..577176e1ffc48 --- /dev/null +++ b/paddle/fluid/operators/cholesky_solve_op.cc @@ -0,0 +1,172 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cholesky_solve_op.h" +#include "paddle/fluid/operators/solve_op.h" + +namespace paddle { +namespace operators { + +class CholeskySolveOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddComment(R"DOC(Solves a linear system of equations with a positive " + "semidefinite matrix to be inverted given its Cholesky factor matrix uu." 
+ ")DOC"); + AddInput("X", "(Tensor) The input tensor, shape of (*,m,k)"); + AddInput("Y", + "(Tensor) The input tensor, shape of (*,m,m) composed of upper or " + "lower triangular Cholesky factor"); + AddOutput("Out", "(Tensor) The output tensor, shape same to X"); + AddAttr("upper", + "whether to consider the Cholesky factor " + "as a lower or upper triangular matrix") + .SetDefault(false); + } +}; + +class CholeskySolveOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "CholeskySolve"); + OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "CholeskySolve"); + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "CholeskySolve"); + auto u_dims = context->GetInputDim("Y"); + auto b_dims = context->GetInputDim("X"); + int u_rank = u_dims.size(); + int b_rank = b_dims.size(); + PADDLE_ENFORCE_GE(u_rank, 2, + platform::errors::InvalidArgument( + "the rank of input Y must greater or equal to 2")); + PADDLE_ENFORCE_GE(b_rank, 2, + platform::errors::InvalidArgument( + "the rank of input X must greater or equal to 2")); + PADDLE_ENFORCE_EQ(u_dims[u_rank - 1], u_dims[u_rank - 2], + platform::errors::InvalidArgument( + "input Matrix Y should be square matrix," + "But Got last shape of %ld x %ld", + u_dims[u_rank - 1], u_dims[u_rank - 2])); + PADDLE_ENFORCE_EQ( + b_dims[b_rank - 2], u_dims[u_rank - 2], + platform::errors::InvalidArgument( + "the first dim of input X must equal to the dim of input Y," + "But Got %ld and %ld", + b_dims[b_rank - 2], u_dims[u_rank - 2])); + + std::vector u_dims_vec = paddle::framework::vectorize(u_dims); + std::vector b_dims_vec = paddle::framework::vectorize(b_dims); + + std::vector u_dims_vec_cut(u_dims_vec.begin(), + u_dims_vec.end() - 2); + std::vector b_dims_vec_cut(b_dims_vec.begin(), + b_dims_vec.end() - 2); + + std::vector expand_batch_portion = + get_broadcast_batch_portion(u_dims_vec_cut, b_dims_vec_cut); + + std::vector b_broadcast_dims({expand_batch_portion}); + b_broadcast_dims.insert(b_broadcast_dims.end(), + {b_dims_vec[b_rank - 2], b_dims_vec[b_rank - 1]}); + + // dim of 'Out' is the same with 'Y' after broadcast + context->SetOutputDim("Out", framework::make_ddim(b_broadcast_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Y"), ctx.GetPlace()); + } +}; + +class CholeskySolveOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto var_type = ctx->GetInputType("Y", 0); + auto data_type = ctx->GetInputDataType("Y", 0); + + ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS); + ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS); + } +}; + +template +class CholeskySolveOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("cholesky_solve_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput("Y", this->Input("Y")); + retv->SetInput("Out", this->Output("Out")); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + 
retv->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CholeskySolveGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "cholesky_solve"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "cholesky_solve"); + OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "cholesky_solve"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "cholesky_solve"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OPERATOR(cholesky_solve, ops::CholeskySolveOp, + ops::CholeskySolveOpMaker, + ops::CholeskySolveOpVarTypeInference, + ops::CholeskySolveOpGradMaker, + ops::CholeskySolveOpGradMaker); + +REGISTER_OPERATOR(cholesky_solve_grad, ops::CholeskySolveGradOp); + +REGISTER_OP_CPU_KERNEL( + cholesky_solve, + ops::CholeskySolveKernel, + ops::CholeskySolveKernel); + +REGISTER_OP_CPU_KERNEL( + cholesky_solve_grad, + ops::CholeskySolveGradKernel, + ops::CholeskySolveGradKernel); +// Complex<> is not supported because of TensorExpand, which used to boardcast +// input Tensor diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu new file mode 100644 index 0000000000000..f42364c9619ef --- /dev/null +++ b/paddle/fluid/operators/cholesky_solve_op.cu @@ -0,0 +1,136 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/cholesky_solve_op.h" +#include "paddle/fluid/platform/dynload/cusolver.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using CUDADeviceContext = paddle::platform::CUDADeviceContext; + +template +void cusolver_potrs(const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, + int n, int nrhs, T *Adata, int lda, T *Bdata, int ldb, + int *devInfo); + +template <> +void cusolver_potrs(const cusolverDnHandle_t &cusolverH, + cublasFillMode_t uplo, int n, int nrhs, float *Adata, + int lda, float *Bdata, int ldb, int *devInfo) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSpotrs( + cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); +} + +template <> +void cusolver_potrs(const cusolverDnHandle_t &cusolverH, + cublasFillMode_t uplo, int n, int nrhs, + double *Adata, int lda, double *Bdata, int ldb, + int *devInfo) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDpotrs( + cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); +} + +template <> +void cusolver_potrs>( + const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, + platform::complex *Adata, int lda, platform::complex *Bdata, + int ldb, int *devInfo) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnCpotrs( + cusolverH, uplo, n, nrhs, reinterpret_cast(Adata), lda, + reinterpret_cast(Bdata), ldb, devInfo)); +} + +template <> +void cusolver_potrs>( + const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, + platform::complex *Adata, int lda, platform::complex *Bdata, + int ldb, int *devInfo) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnZpotrs( + cusolverH, uplo, n, nrhs, + reinterpret_cast(Adata), lda, + reinterpret_cast(Bdata), ldb, devInfo)); +} + +template +class CholeskySolveFunctor { + public: + void operator()(const platform::CUDADeviceContext &dev_ctx, bool upper, int n, + int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { + cublasFillMode_t uplo = + upper ? 
CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; + + /* step 1: get cusolver handle*/ + auto cusolverH = dev_ctx.cusolver_dn_handle(); + + /* step 2: solve A0*X0 = B0 */ + cusolver_potrs(cusolverH, uplo, n, nrhs, Adata, lda, Bdata, lda, + devInfo); + + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + } +}; + +template +class MatrixReduceSumFunctor { + public: + void operator()(const Tensor &in, Tensor *out, + const framework::ExecutionContext &ctx) { + // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] + // out_reduce_dim should be [0, 2] + const std::vector in_dims = framework::vectorize(in.dims()); + auto in_size = in_dims.size(); + const std::vector out_dims = + framework::vectorize(out->dims()); + auto out_size = out_dims.size(); + + std::vector out_bst_dims(in_size); + + std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); + std::copy(out_dims.data(), out_dims.data() + out_size, + out_bst_dims.data() + in_size - out_size); + + std::vector out_reduce_dims; + for (size_t idx = 0; idx <= in_size - 3; idx++) { + if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { + out_reduce_dims.push_back(idx); + } + } + gpuStream_t stream = ctx.cuda_device_context().stream(); + TensorReduceFunctorImpl>( + in, out, kps::IdentityFunctor(), out_reduce_dims, stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + cholesky_solve, + ops::CholeskySolveKernel, + ops::CholeskySolveKernel); + +REGISTER_OP_CUDA_KERNEL( + cholesky_solve_grad, + ops::CholeskySolveGradKernel, + ops::CholeskySolveGradKernel); + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h new file mode 100644 index 0000000000000..94b68bff8f446 --- /dev/null +++ b/paddle/fluid/operators/cholesky_solve_op.h @@ -0,0 +1,247 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/lapack_function.h" +#include "paddle/fluid/operators/solve_op.h" +#include "paddle/fluid/operators/svd_helper.h" +#include "paddle/fluid/operators/triangular_solve_op.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/kernels/math_kernel.h" + +namespace paddle { +namespace operators { // namespace operators + +template +class CholeskySolveFunctor { + public: + void operator()(const platform::DeviceContext &dev_ctx, bool upper, int n, + int nrhs, T *Adata, int lda, T *Bdata, int *devInfo); +}; + +template +class CholeskySolveFunctor { + public: + void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n, + int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { + char uplo = upper ? 
'U' : 'L'; + math::lapackCholeskySolve(uplo, n, nrhs, Adata, lda, Bdata, lda, + devInfo); + } +}; + +template +void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, + const framework::Tensor &uin, + const framework::Tensor &bin, framework::Tensor *out, + bool upper) { + const auto &dev_ctx = ctx.template device_context(); + // framework::Tensor broadcast + std::vector u_bst_dims_vec; + std::vector b_bst_dims_vec; + std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(uin, bin); + framework::Tensor u_bst(uin.type()); + TensorExpand(dev_ctx, uin, &u_bst, u_bst_dims_vec); + + framework::Tensor b_bst(bin.type()); + TensorExpand(dev_ctx, bin, &b_bst, b_bst_dims_vec); + + math::DeviceIndependenceTensorOperations helper(ctx); + + // calculate u's conjugate for complex + framework::Tensor u_conj(u_bst.type()); + platform::ForRange u_for_range(dev_ctx, u_bst.numel()); + math::ConjFunctor u_functor( + u_bst.data(), u_bst.numel(), + u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); + u_for_range(u_functor); + u_conj = helper.Transpose(u_conj); + + // calculate b's conjugate for complex + framework::Tensor b_conj(b_bst.type()); + platform::ForRange b_for_range(dev_ctx, b_bst.numel()); + math::ConjFunctor b_functor( + b_bst.data(), b_bst.numel(), + b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); + b_for_range(b_functor); + b_conj = helper.Transpose(b_conj); + + auto ut_data = u_conj.mutable_data(dev_ctx.GetPlace()); + auto uindims = u_bst.dims(); + auto bindims = b_bst.dims(); + int uinrank = uindims.size(); + int binrank = bindims.size(); + + int n = uindims[uinrank - 2]; + int nrhs = bindims[binrank - 1]; + int ldab = std::max(1, n); + + // framework::Tensor out_copy(b_conj.type()); + // out_copy.Resize(b_conj.dims()); + framework::TensorCopy(b_conj, dev_ctx.GetPlace(), out); + T *out_data = out->mutable_data(dev_ctx.GetPlace()); + + auto info_dims = slice_ddim(bindims, 0, binrank - 2); + auto batchsize = product(info_dims); + + framework::Tensor tmp; + std::vector tmpdim(1, batchsize); + tmp.Resize(framework::make_ddim(tmpdim)); + int *info = tmp.mutable_data(dev_ctx.GetPlace()); + + CholeskySolveFunctor functor; + for (int b = 0; b < batchsize; b++) { + auto uin_data_item = &ut_data[b * n * n]; + auto out_data_item = &out_data[b * n * nrhs]; + auto info_item = &info[b]; + functor(dev_ctx, upper, n, nrhs, uin_data_item, ldab, out_data_item, + info_item); + } + + // calculate out's conjugate for complex + platform::ForRange out_for_range(dev_ctx, out->numel()); + math::ConjFunctor out_functor( + out->data(), out->numel(), + out->mutable_data(out->dims(), dev_ctx.GetPlace())); + out_for_range(out_functor); + *out = helper.Transpose(*out); +} + +template +class CholeskySolveKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto *uin = ctx.Input("Y"); + auto *bin = ctx.Input("X"); + auto *out = ctx.Output("Out"); + auto upper = ctx.Attr("upper"); + cholesky_solve_fn(ctx, *uin, *bin, out, upper); + } +}; + +template +class CholeskySolveGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *bin = ctx.Input("X"); + auto *uin = ctx.Input("Y"); + auto *out = ctx.Input("Out"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *db = ctx.Output(framework::GradVarName("X")); + auto *du = ctx.Output(framework::GradVarName("Y")); + auto upper = ctx.Attr("upper"); + + const auto &dev_ctx = ctx.template 
device_context(); + math::DeviceIndependenceTensorOperations helper(ctx); + + std::vector u_bst_dims_vec; + std::vector b_bst_dims_vec; + std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(*uin, *bin); + framework::Tensor u_bst(uin->type()); + TensorExpand(dev_ctx, *uin, &u_bst, u_bst_dims_vec); + + framework::Tensor db_bst(bin->type()); + TensorExpand(dev_ctx, *bin, &db_bst, b_bst_dims_vec); + + if (dout) { + db->mutable_data(dev_ctx.GetPlace()); + cholesky_solve_fn(ctx, u_bst, *dout, &db_bst, upper); + + if (db_bst.dims() == db->dims()) { + framework::TensorCopy(db_bst, dev_ctx.GetPlace(), dev_ctx, db); + } else { + MatrixReduceSumFunctor functor; + functor(db_bst, db, ctx); + db->Resize(bin->dims()); + } + + auto blas = math::GetBlas(ctx); + + // calculate out's conjugate for complex + framework::Tensor out_conj(out->type()); + platform::ForRange out_for_range(dev_ctx, out->numel()); + math::ConjFunctor out_functor( + out->data(), out->numel(), + out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); + out_for_range(out_functor); + out_conj = helper.Transpose(out_conj); + + framework::Tensor commonterm(out->type()); + auto outdims = out_conj.dims(); + auto dbdims = db_bst.dims(); + auto mat_dim_a = math::CreateMatrixDescriptor(outdims, 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(dbdims, 0, false); + auto cmtdim = outdims; + cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2]; + commonterm.Resize(cmtdim); + commonterm.mutable_data(dev_ctx.GetPlace()); + blas.MatMul(db_bst, mat_dim_b, out_conj, mat_dim_a, static_cast(1), + &commonterm, static_cast(0)); + + // calculate commonterm's conjugate for complex + framework::Tensor commonterm_conj(commonterm.type()); + platform::ForRange commonterm_for_range( + dev_ctx, commonterm.numel()); + math::ConjFunctor commonterm_functor( + commonterm.data(), commonterm.numel(), + commonterm_conj.mutable_data(commonterm.dims(), + dev_ctx.GetPlace())); + commonterm_for_range(commonterm_functor); + commonterm_conj = helper.Transpose(commonterm_conj); + + auto pt_x = paddle::experimental::MakePtenDenseTensor(commonterm); + auto pt_y = paddle::experimental::MakePtenDenseTensor(commonterm_conj); + auto pt_z = paddle::experimental::MakePtenDenseTensor(commonterm); + pten::AddKernel(dev_ctx, *pt_x.get(), *pt_y.get(), -1, pt_z.get()); + + auto mat_dim_u = math::CreateMatrixDescriptor(u_bst.dims(), 0, false); + auto mat_dim_c = + math::CreateMatrixDescriptor(commonterm.dims(), 0, false); + + Tensor du_bst(uin->type()); + // get upper or lower triangular + du_bst.Resize(u_bst.dims()); + du_bst.mutable_data(dev_ctx.GetPlace()); + if (upper) { + blas.MatMul(u_bst, mat_dim_u, commonterm, mat_dim_c, static_cast(-1), + &du_bst, static_cast(0)); + } else { + blas.MatMul(commonterm, mat_dim_c, u_bst, mat_dim_u, static_cast(-1), + &du_bst, static_cast(0)); + } + + const auto &udims = u_bst.dims(); + const auto H = udims[udims.size() - 2]; + const auto W = udims[udims.size() - 1]; + platform::ForRange x_for_range(dev_ctx, u_bst.numel()); + TrilTriuCompute tril_triu_computer(du_bst.data(), 0, !upper, H, W, + u_bst.data()); + x_for_range(tril_triu_computer); + + du->mutable_data(dev_ctx.GetPlace()); + if (u_bst.dims() == du->dims()) { + framework::TensorCopy(u_bst, dev_ctx.GetPlace(), dev_ctx, du); + } else { + MatrixReduceSumFunctor functor; + functor(u_bst, du, ctx); + du->Resize(uin->dims()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/class_center_sample_op.cu 
b/paddle/fluid/operators/class_center_sample_op.cu index f1ccbc913d9b1..fad74b81e14e4 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -397,7 +397,9 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) + 1) * vec_size; - auto gen_cuda = framework::GetDefaultCUDAGenerator(rank); + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); if (gen_cuda->GetIsInitPy() && (!fix_seed)) { auto seed_offset = gen_cuda->IncrementOffset(offset); seed_data = seed_offset.first; diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index abf721936b41e..f08a7b2d57314 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -18,6 +18,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/transform.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" +#endif namespace paddle { namespace operators { @@ -25,17 +28,6 @@ namespace operators { using framework::Tensor; using platform::Transform; -#if defined(__NVCC__) || defined(__HIPCC__) -template -__global__ void ClipCudaKernel(const T* input, T* out, int num, - UnaryOperation op) { - int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < num) { - out[idx] = op(input[idx]); - } -} -#endif - template class ClipFunctor { public: @@ -95,7 +87,7 @@ class ClipKernel : public framework::OpKernel { platform::errors::InvalidArgument( "max should be greater than or equal to min. " "But received min = %f, max = %f", - min, max)); + static_cast(min), static_cast(max))); auto* x_var = context.InputVar("X"); if (x_var->IsType()) { @@ -106,12 +98,12 @@ class ClipKernel : public framework::OpKernel { int64_t numel = x->numel(); if (platform::is_gpu_place(context.GetPlace())) { #if defined(__NVCC__) || defined(__HIPCC__) - int threads = 256; - int blocks = (numel + threads - 1) / threads; - ClipCudaKernel><<< - blocks, threads, 0, - context.template device_context() - .stream()>>>(x_data, out_data, numel, ClipFunctor(min, max)); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = ClipFunctor(min, max); + LaunchSameDimsElementwiseCudaKernel( + context.template device_context(), ins, + &outs, functor); #endif } else { Transform trans; diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 752e5dc4a8772..5655fd25ec24b 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -260,8 +260,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { size_of_dtype : len; ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" - << " address: " << out_tensors[i]->data() << " len: " << len - << ", "; + << " address: " << out_tensors[i]->data() << " len: " << len << ", "; offset += len; } PADDLE_ENFORCE_EQ( @@ -300,9 +299,8 @@ class CoalesceTensorOpKernel : public framework::OpKernel { place, align_size) / size_of_dtype : static_cast(size); - const void *ptr = lod_tensors[i]->IsInitialized() - ? lod_tensors[i]->data() - : nullptr; + const void *ptr = + lod_tensors[i]->IsInitialized() ? 
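With ClipCudaKernel removed in the clip_op.h hunk above, the GPU path now reuses the same ClipFunctor through LaunchSameDimsElementwiseCudaKernel, so CPU and GPU share a single definition of clip(x) = min(max(x, min), max). A host-side stand-in using std::transform (illustrative; ClipFn below is a hypothetical functor with the usual clamp semantics, not the actual ClipFunctor definition):

#include <algorithm>
#include <cstdio>
#include <vector>

template <typename T>
struct ClipFn {  // same role as the functor handed to the elementwise launcher
  T lo, hi;
  T operator()(T x) const { return std::min(std::max(x, lo), hi); }
};

int main() {
  std::vector<float> x{-2.f, -0.5f, 0.f, 0.7f, 3.f};
  std::vector<float> out(x.size());
  // Host-side stand-in for the same-dims elementwise launch: apply the
  // functor to every element of the equally shaped input/output.
  std::transform(x.begin(), x.end(), out.begin(), ClipFn<float>{-1.f, 1.f});
  for (float v : out) std::printf("%g ", v);  // -1 -0.5 0 0.7 1
  std::printf("\n");
}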
lod_tensors[i]->data() : nullptr; VLOG(4) << size << " " << len; ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() << ") " diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 4e6d86d49e863..226b2c5132318 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -43,7 +43,7 @@ class AllReduceOpKernel : public framework::OpKernel { int dtype = platform::ToNCCLDataType(in->type()); int64_t numel = in->numel(); - auto* sendbuff = in->data(); + auto* sendbuff = in->data(); out->Resize(in->dims()); void* recvbuff = out->mutable_data(place); diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index c9aef237699f3..a98a0bf6ab4a9 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -33,7 +33,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel { auto place = ctx.GetPlace(); ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); int64_t numel = in->numel(); - const void* sendbuff = in->data(); + const void* sendbuff = in->data(); void* recvbuff = out->mutable_data(place); int rid = ctx.Attr("ring_id"); diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index daaaf8b7a2e41..229d42e64e4e5 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -46,7 +46,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { "because this op can only be an In-Place operation.")); void* send_recv_buffer = out->mutable_data(ctx.GetPlace()); PADDLE_ENFORCE_EQ( - send_recv_buffer, in->data(), + send_recv_buffer, in->data(), platform::errors::PreconditionNotMet("Currently, the broadcast op can " "only be an In-Place operation.")); diff --git a/paddle/fluid/operators/collective/broadcast_op_xpu.cc b/paddle/fluid/operators/collective/broadcast_op_xpu.cc index 9cd5c5fd22cc3..e8566803aecfa 100644 --- a/paddle/fluid/operators/collective/broadcast_op_xpu.cc +++ b/paddle/fluid/operators/collective/broadcast_op_xpu.cc @@ -52,7 +52,7 @@ class BKCLBroadcastOpKernel : public framework::OpKernel { "because this op can only be an In-Place operation.")); void* send_recv_buffer = out->mutable_data(ctx.GetPlace()); PADDLE_ENFORCE_EQ( - send_recv_buffer, in->data(), + send_recv_buffer, in->data(), platform::errors::PreconditionNotMet("Currently, the broadcast op can " "only be an In-Place operation.")); diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index b950339bd22be..c06b2683a6bbe 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -213,7 +213,7 @@ class CReduceOpXPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); BKCLDataType dtype = platform::ToBKCLDataType(in->type()); int64_t numel = in->numel(); - const void* sendbuff = in->data(); + const void* sendbuff = in->data(); out->Resize(in->dims()); void* recvbuff = out->mutable_data(place); @@ -276,7 +276,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { auto place = ctx.GetPlace(); ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); int64_t numel = in->numel(); - const void* sendbuff = in->data(); + const void* sendbuff = in->data(); out->Resize(in->dims()); void* recvbuff = 
out->mutable_data(place); diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 0b5a35f515ef0..1012e9383f607 100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -20,8 +20,7 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/kernels/cpu/conj_kernel.h" -#include "paddle/pten/kernels/gpu/conj_kernel.h" +#include "paddle/pten/kernels/complex_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index a783a619473ef..4c9727391759b 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -164,7 +164,7 @@ void ChooseAlgo(const std::vector& perf_results, VLOG(3) << " choose algo: " << result.algo << ", TC: " << math_type_str << ", time: " << result.time << " ms" << ", wksp = " << result.memory << ", status = " << result.status; - return; + break; } } } @@ -197,7 +197,6 @@ static void SetConvMathType(const framework::ExecutionContext& ctx, VLOG(5) << "NOT use cudnn_tensor_op_math"; } #endif - return; } struct ConvArgs { diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 566e99c357fbe..cbe78d9a25b50 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -65,7 +65,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { int groups = ctx.Attr("groups"); bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && + ctx.Attr("exhaustive_search")); bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, @@ -386,7 +387,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { int groups = ctx.Attr("groups"); bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && + ctx.Attr("exhaustive_search")); bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, @@ -437,7 +439,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ctx, input_grad, &transformed_input_grad_channel); // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy // the data of input_grad to transformed_input_grad_channel. - if (ctx.Attr("use_addto")) { + if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { TransToChannelFirst( ctx, input_grad, &transformed_input_grad_channel); } @@ -703,15 +705,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // MIOPEN ONLY support beta to be 0.0f ScalingParamType beta = 0.0f; #else - ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; + ScalingParamType beta = + (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) ? 1.0f : 0.0f; #endif - VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); + VLOG(4) << "Conv_grad: use_addto = " + << (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")); if (input_grad) { // When beta is 0, it is unnecessary to reset input_grad. // When beta is 1, the output cannot be reset since addt strategy used. 
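The `ctx.HasAttr("use_addto")` and `ctx.HasAttr("exhaustive_search")` guards added above presumably let these kernels also run when the attribute is absent from the op desc (for example, older serialized programs), falling back to the default behaviour instead of throwing. The `beta` value follows cuDNN's output-scaling convention, `dst = alpha * result + beta * dst`, so `beta = 1.0f` accumulates the freshly computed gradient into the existing buffer (the inplace addto strategy mentioned in the comment above) while `beta = 0.0f` overwrites it. A minimal host-side sketch of that convention, purely illustrative (the function name is made up):

    #include <vector>

    // dst = alpha * result + beta * dst; beta == 0 overwrites, beta == 1 accumulates.
    void ScaleAndAccumulate(const std::vector<float>& result, float alpha,
                            float beta, std::vector<float>* dst) {
      for (size_t i = 0; i < dst->size(); ++i) {
        (*dst)[i] = alpha * result[i] + beta * (*dst)[i];
      }
    }
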
#ifdef PADDLE_WITH_HIP - if (ctx.Attr("use_addto")) { + if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { Tensor temp_tensor(transformed_input_grad.type()); temp_tensor.Resize(transformed_input_grad.dims()); T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); @@ -878,7 +882,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { int groups = ctx.Attr("groups"); bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && + ctx.Attr("exhaustive_search")); bool deterministic = FLAGS_cudnn_deterministic; auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index b262f05d6b187..18c45a1a4c6c1 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -144,8 +144,8 @@ void MaxIoU(const framework::Tensor& iou, framework::Tensor* max_iou) { static void AppendProposals(framework::Tensor* dst, int64_t offset, const framework::Tensor& src) { - auto* out_data = dst->data(); - auto* to_add_data = src.data(); + auto* out_data = dst->data(); + auto* to_add_data = src.data(); size_t size_of_t = framework::SizeOfType(src.type()); offset *= size_of_t; std::memcpy( diff --git a/paddle/fluid/operators/dirichlet_op.cc b/paddle/fluid/operators/dirichlet_op.cc new file mode 100644 index 0000000000000..f981660165717 --- /dev/null +++ b/paddle/fluid/operators/dirichlet_op.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
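The new dirichlet_op files that follow draw a Dirichlet(alpha) sample by generating one Gamma(alpha_i, 1) variate per element of `Alpha` (`sample_gamma` implements Marsaglia-Tsang acceptance-rejection, with a boosting step for alpha < 1) and then normalizing along the last axis via a reduce-sum followed by an elementwise divide. A self-contained CPU sketch of the same scheme, using only the standard library and hypothetical names:

    #include <random>
    #include <vector>

    // Dirichlet(alpha): sample K independent Gamma(alpha_i, 1) variates and
    // normalize them onto the simplex. std::gamma_distribution stands in for
    // the hand-written Marsaglia-Tsang sampler used below (which must also
    // compile as device code).
    std::vector<double> SampleDirichlet(const std::vector<double>& alpha,
                                        std::mt19937* gen) {
      std::vector<double> g(alpha.size());
      double sum = 0.0;
      for (size_t i = 0; i < alpha.size(); ++i) {
        std::gamma_distribution<double> gamma(alpha[i], 1.0);
        g[i] = gamma(*gen);
        sum += g[i];
      }
      for (double& v : g) v /= sum;  // normalize along the last axis
      return g;
    }
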
+ +#include "paddle/fluid/operators/dirichlet_op.h" + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" + +namespace paddle { +namespace operators { +template +struct GammaCPUFunctor { + GammaCPUFunctor(const T* alpha, T* gamma, + BaseSampler uniform, + BaseSampler normal) + : alpha_(alpha), gamma_(gamma), uniform_(uniform), normal_(normal) {} + + HOST void operator()(int64_t index) { + auto sample = sample_gamma( + alpha_[index], uniform_, normal_); + gamma_[index] = std::max(std::numeric_limits::min(), sample); + } + + const T* alpha_; + T* gamma_; + BaseSampler uniform_; + BaseSampler normal_; +}; + +template +struct DirichletSampler { + void operator()(const framework::ExecutionContext& ctx, const Tensor* alpha, + Tensor* out) { + auto& dev_ctx = ctx.device_context(); + + auto p_gen = framework::DefaultCPUGenerator(); + auto generator = p_gen->GetCPUEngine(); + + auto uniform = [&generator]() -> T { + std::uniform_real_distribution u(0.0, 1.0); + return u(*generator); + }; + BaseSampler standard_uniform(uniform); + + auto normal = [&generator]() { + std::normal_distribution n(0.0, 1.0); + return n(*generator); + }; + BaseSampler standard_normal(normal); + + // sample from K gamma distributions, where K=alpha.numel() + framework::Tensor gamma_samples; + gamma_samples.mutable_data(alpha->dims(), dev_ctx.GetPlace()); + GammaCPUFunctor gamma_functor( + alpha->data(), gamma_samples.data(), standard_uniform, + standard_normal); + platform::ForRange for_range(dev_ctx, + alpha->numel()); + for_range(gamma_functor); + + // normalize them into a simplex, along the last axis + framework::Tensor gamma_sum; + auto new_shape = gamma_samples.dims(); + new_shape[new_shape.size() - 1] = 1; + gamma_sum.mutable_data(new_shape, dev_ctx.GetPlace()); + + ReduceKernelFunctor( + &gamma_samples, &gamma_sum, {new_shape.size() - 1}, true, false, ctx) + .template apply(); + ElementwiseComputeEx, platform::CPUDeviceContext, T, T>( + ctx, &gamma_samples, &gamma_sum, -1, DivFunctor(), out); + } +}; + +class DirichletOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Alpha", "(Tensor), The dirichlet Alpha parameter"); + AddOutput("Out", "(Tensor), The output tensor of sample"); + AddComment(R"DOC(Sample random data from dirichlet distribution.)DOC"); + } +}; + +class DirichletOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Alpha"), "Input", "Alpha", "dirichlet"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "dirichlet"); + const auto alpha_dim = ctx->GetInputDim("Alpha"); + PADDLE_ENFORCE_GE(alpha_dim.size(), 1, + platform::errors::InvalidArgument( + "ShapeError: The number of dimensions of 'Alpha' " + "must be greater than or euqal to 1. 
" + "But received Alpha's dimensions = %d,", + alpha_dim.size())); + ctx->ShareDim("Alpha", /*->*/ "Out"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(dirichlet, paddle::operators::DirichletOp, + paddle::operators::DirichletOpMaker); +REGISTER_OP_CPU_KERNEL( + dirichlet, + paddle::operators::DirichletKernel, + paddle::operators::DirichletKernel); diff --git a/paddle/fluid/operators/dirichlet_op.cu b/paddle/fluid/operators/dirichlet_op.cu new file mode 100644 index 0000000000000..3e1d523ae0e15 --- /dev/null +++ b/paddle/fluid/operators/dirichlet_op.cu @@ -0,0 +1,115 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/dirichlet_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" +#include "paddle/fluid/platform/for_range.h" + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif + +#if defined(PADDLE_WITH_CUDA) +using COMPAT_RANDSTATEPHILOX4_32_10_T = curandStatePhilox4_32_10_t; +#define COMPAT_RAND_INIT curand_init +#define COMPAT_RAND_UNIFORM curand_uniform +#define COMPAT_RAND_NORMAL curand_normal +#elif defined(PADDLE_WITH_HIP) +using COMPAT_RANDSTATEPHILOX4_32_10_T = hiprandStatePhilox4_32_10_t; +#define COMPAT_RAND_INIT hiprand_init +#define COMPAT_RAND_UNIFORM hiprand_uniform +#define COMPAT_RAND_NORMAL hiprand_normal +#endif + +namespace paddle { +namespace operators { +template +struct GammaCUDAFunctor { + GammaCUDAFunctor(const T* alpha, T* gamma, uint64_t seed, uint64_t offset) + : alpha_(alpha), gamma_(gamma), seed_(seed), offset_(offset) {} + + DEVICE void operator()(int64_t index) { + // curand initialization + COMPAT_RANDSTATEPHILOX4_32_10_T state; + COMPAT_RAND_INIT(/*seed=*/seed_, /*subsequence=*/index, /*offset=*/offset_, + &state); + + // sample + auto uniform_lambda = [&state]() { return COMPAT_RAND_UNIFORM(&state); }; + BaseSampler standard_uniform(uniform_lambda); + auto normal_lambda = [&state]() { return COMPAT_RAND_NORMAL(&state); }; + BaseSampler standard_normal(normal_lambda); + + auto sample = + sample_gamma( + alpha_[index], standard_uniform, standard_normal); + gamma_[index] = std::max(std::numeric_limits::min(), sample); + } + + const T* alpha_; + T* gamma_; + const uint64_t seed_; + const uint64_t offset_; +}; + +template +struct DirichletSampler { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor* alpha, framework::Tensor* out) { + auto& dev_ctx = ctx.device_context(); + + // init state, seed & offset for all threads + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + auto p_gen = framework::GetDefaultCUDAGenerator(device_id); + auto seed_and_offset = p_gen->IncrementOffset(10); // hard-coded offset + auto seed = 
seed_and_offset.first; + auto offset = seed_and_offset.second; + + // sample from K gamma distributions, where K=alpha.numel() + framework::Tensor gamma_samples; + gamma_samples.mutable_data(alpha->dims(), dev_ctx.GetPlace()); + GammaCUDAFunctor gamma_functor(alpha->data(), gamma_samples.data(), + seed, offset); + platform::ForRange for_range(dev_ctx, + out->numel()); + for_range(gamma_functor); + + // normalize them into a simplex, along the last axis + framework::Tensor gamma_sum; + auto new_shape = gamma_samples.dims(); + new_shape[new_shape.size() - 1] = 1; + gamma_sum.mutable_data(new_shape, dev_ctx.GetPlace()); + + ReduceKernelFunctor( + &gamma_samples, &gamma_sum, {new_shape.size() - 1}, true, false, ctx) + .template apply(); + ElementwiseComputeEx, platform::CUDADeviceContext, T, T>( + ctx, &gamma_samples, &gamma_sum, -1, DivFunctor(), out); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + dirichlet, ops::DirichletKernel, + ops::DirichletKernel); diff --git a/paddle/fluid/operators/dirichlet_op.h b/paddle/fluid/operators/dirichlet_op.h new file mode 100644 index 0000000000000..540acad423aa3 --- /dev/null +++ b/paddle/fluid/operators/dirichlet_op.h @@ -0,0 +1,129 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +// ROCM hcc doesn't work well with using std:: in kernel functions +#if defined(PADDLE_WITH_CUDA) +#define COMPAT_EXP exp +#define COMPAT_CEIL ceil +#define COMPAT_FLOOR floor +#define COMPAT_LOG log +#define COMPAT_POW pow +#define COMPAT_SQRT sqrt +#define COMPAT_TAN tan +#define COMPAT_ABS abs +#define COMPAT_LOG1P log1p +#else +#define COMPAT_EXP std::exp +#define COMPAT_CEIL std::ceil +#define COMPAT_FLOOR std::floor +#define COMPAT_LOG std::log +#define COMPAT_POW std::pow +#define COMPAT_SQRT std::sqrt +#define COMPAT_TAN std::tan +#define COMPAT_ABS std::abs +#define COMPAT_LOG1P std::log1p +#endif + +namespace paddle { +namespace operators { +template +struct DirichletSampler; + +template +struct BaseSampler { + SamplerT sampler_; + HOSTDEVICE BaseSampler(const SamplerT& sampler) : sampler_(sampler) {} + HOSTDEVICE ScalarT sample() { return sampler_(); } +}; + +// `sample_gamma` is d from Numpy's distributions.c, and add support for +// paddle data type and code style. 
+// Source MIT licensed: +/* Copyright 2005 Robert Kern (robert.kern@gmail.com) + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +template +HOSTDEVICE ScalarT sample_gamma( + ScalarT alpha, BaseSampler standard_uniform, + BaseSampler standard_normal) { + AccscalarT scale = 1.0f; + + // Boost alpha for higher acceptance probability. + if (alpha < 1.0f) { + if (alpha == 0.f) return 0.f; + scale *= COMPAT_POW(1 - standard_uniform.sample(), 1.0f / alpha); + alpha += 1.0f; + } + + // This implements the acceptance-rejection method of Marsaglia and Tsang + // (2000) + // doi:10.1145/358407.358414 + const AccscalarT d = alpha - 1.0f / 3.0f; + const AccscalarT c = 1.0f / COMPAT_SQRT(9.0f * d); + for (;;) { + AccscalarT x, y; + do { + x = standard_normal.sample(); + y = 1.0f + c * x; + } while (y <= 0); + const AccscalarT v = y * y * y; + const AccscalarT u = 1 - standard_uniform.sample(); + const AccscalarT xx = x * x; + if (u < 1.0f - 0.0331f * xx * xx) + return static_cast(scale * d * v); + if (COMPAT_LOG(u) < 0.5f * xx + d * (1.0f - v + COMPAT_LOG(v))) + return static_cast(scale * d * v); + } +} + +template +class DirichletKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* alpha = ctx.Input("Alpha"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + DirichletSampler sampler; + sampler(ctx, alpha, out); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h new file mode 100644 index 0000000000000..c6305e5ba73e8 --- /dev/null +++ b/paddle/fluid/operators/distribution_helper.h @@ -0,0 +1,197 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +#endif + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace distribution { + +using Tensor = framework::Tensor; + +template +struct exponential_transform { + explicit exponential_transform(T lambda) : lambda_(lambda) {} + + HOSTDEVICE inline T operator()(T val) const { +#if defined(__NVCC__) || defined(__HIPCC__) + if (std::is_same::value) { + return static_cast(-1.0) / lambda_ * log(val); + } else { + return static_cast(-1.0) / lambda_ * __logf(val); + } +#else + return static_cast(-1.0) / lambda_ * std::log(static_cast(1.0) - val); +#endif + } + + private: + T lambda_; +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +template +struct uniform_distribution; + +template +struct normal_distribution; + +#if defined(__NVCC__) +template <> +struct uniform_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +#else +template <> +struct uniform_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; +#endif + +template +__global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, + DistOp dist, TransformOp trans, + T *out_data) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + int32_t returns_count = DistOp::kReturnsCount; +#if defined(__NVCC__) + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, offset, &state); +#else + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, idx, offset, &state); +#endif + size_t total_thread = gridDim.x * blockDim.x; + for (size_t i = idx; i < size; i += total_thread * returns_count) { + auto random_tuple = dist(&state); + for (size_t j = 0; j < returns_count; j++) { + size_t index = i + j * total_thread; + if (index < size) { + auto random = 
static_cast((&random_tuple.x)[j]); + out_data[index] = trans(random); + } + } + } +} + +template +void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx, + Tensor *out, DistOp dist, TransformOp trans) { + T *out_data = out->mutable_data(dev_ctx.GetPlace()); + auto size = out->numel(); + + int64_t device_id = + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + size_t block_size = 256; + size_t expect_grid_size = (size + block_size - 1) / block_size; + const auto &prop = platform::GetDeviceProperties(device_id); + size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) * + prop.multiProcessorCount; + size_t grid_size = + expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size; + + size_t total_thread = block_size * grid_size; + size_t curand4_loop_times = + (size + 4 * total_thread - 1) / (4 * total_thread); + // 'increment' shoulde be multiple of 4 + uint64_t increment = curand4_loop_times * 4; + + auto seed_offset = gen_cuda->IncrementOffset(increment); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + DistributionKernel< + T, DistOp, TransformOp><<>>( + size, seed, offset, dist, trans, out_data); +} + +#endif + +} // namespace distribution +} // namespace paddle diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index c97a523caa767..a708cbbfaacfc 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" namespace paddle { namespace operators { @@ -180,9 +181,6 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, return; } - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, size); - // increment is used to set the args(offset) of curand_init, which defines // offset in subsequence. // The detail: @@ -192,11 +190,15 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, // same as the previous calls. uint64_t seed_data; uint64_t increment; - int vec_size = platform::GetVectorizedSize(x_data); - auto offset = ((x_numel - 1) / (config.block_per_grid.x * - config.thread_per_block.x * vec_size) + - 1) * - vec_size; + // VectorizedRandomGenerator use curand_uniform4, so we only support + // vec_size is 4; + int vec_size = (platform::GetVectorizedSize(x_data) == 4) ? 
4 : 1; + int block_size = pten::funcs::GetThreadsConfig(dev_ctx, x_numel, vec_size); + int grid_size = + ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size; + + auto offset = + ((x_numel - 1) / (grid_size * block_size * vec_size) + 1) * vec_size; GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment); @@ -204,26 +206,23 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, #ifdef __HIPCC__ if (vec_size == 4 && size % 4 == 0) { hipLaunchKernelGGL( - HIP_KERNEL_NAME(VectorizedRandomGenerator), - config.block_per_grid, config.thread_per_block, 0, stream, size, - seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, - increment); + HIP_KERNEL_NAME(VectorizedRandomGenerator), grid_size, + block_size, 0, stream, size, seed_data, dropout_prob, x_data, + mask_data, y_data, upscale_in_train, increment); } else { hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator), - config.block_per_grid, config.thread_per_block, 0, - stream, size, seed_data, dropout_prob, x_data, - mask_data, y_data, upscale_in_train, increment); + grid_size, block_size, 0, stream, size, seed_data, + dropout_prob, x_data, mask_data, y_data, + upscale_in_train, increment); } #else if (vec_size == 4 && size % 4 == 0) { - VectorizedRandomGenerator< - T, uint8_t, - 4><<>>( + VectorizedRandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, increment); } else { - RandomGenerator<<>>( + RandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train, increment); } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index a4567beeb4f3d..d6d79d166d00a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -25,7 +25,7 @@ limitations under the License. */ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { @@ -68,7 +68,7 @@ class ElementwiseAddKernel : public framework::OpKernel { auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - pten::Add(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get()); + pten::AddKernel(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 80089243f251b..7a25f65366901 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -13,9 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -23,83 +20,39 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -template -static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y, - const T* out, - const T* dout, - int64_t size, T* dx, - T* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; - - while (col < size) { - T o = dout[col]; - dx[col] = o / y[col]; - dy[col] = -o * out[col] / y[col]; - col += blockDim.x * gridDim.x; - } -} - -template <> -__global__ void -SimpleElemwiseDivGradCUDAKernel>( - const paddle::platform::complex* x, - const paddle::platform::complex* y, - const paddle::platform::complex* out, - const paddle::platform::complex* dout, int64_t size, - paddle::platform::complex* dx, - paddle::platform::complex* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; - - while (col < size) { - paddle::platform::complex o = dout[col]; - paddle::platform::complex y_conj(y[col].real, -y[col].imag); - paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); - dx[col] = o / y_conj; - dy[col] = -o * out_div_y_conj; - col += blockDim.x * gridDim.x; - } -} - -template <> -__global__ void -SimpleElemwiseDivGradCUDAKernel>( - const paddle::platform::complex* x, - const paddle::platform::complex* y, - const paddle::platform::complex* out, - const paddle::platform::complex* dout, int64_t size, - paddle::platform::complex* dx, - paddle::platform::complex* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; - - while (col < size) { - paddle::platform::complex o = dout[col]; - paddle::platform::complex y_conj(y[col].real, -y[col].imag); - paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); - dx[col] = o / y_conj; - dy[col] = -o * out_div_y_conj; - col += blockDim.x * gridDim.x; - } -} - template typename std::enable_if< - std::is_same::value>::type -elementwise_div_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy) { - dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1); - auto size = x->numel(); - dim3 grid_size = - dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1); - SimpleElemwiseDivGradCUDAKernel< - T><<().stream()>>>( - x->data(), y->data(), out->data(), dout->data(), size, - dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); + std::is_same::value>::type +ElementwiseDivGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + const auto& dev_ctx = ctx.template device_context(); + const auto place = ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + dx->mutable_data(place); + if (dx->IsSharedBufferWith(*dout)) { + dx->clear(); + dx->mutable_data(x->dims(), place); + } + std::vector ins = {dout, out, y}; + GetGradXAndYOut( + dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + dx->mutable_data(place); + if (dx->IsSharedBufferWith(*dout)) { 
+ dx->clear(); + dx->mutable_data(x->dims(), place); + } + std::vector ins = {dout, y}; + GetGradXOrYOut(dev_ctx, place, axis, ins, dout, + dx, DivGradXFunctor()); + } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {dout, out, y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor()); + } } } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index f3ba5050c4f53..b13a0539ec6ad 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -28,7 +28,7 @@ limitations under the License. */ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { @@ -62,7 +62,7 @@ class ElementwiseDivKernel : public framework::OpKernel { auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - pten::Divide(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get()); + pten::DivideKernel(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } }; @@ -111,26 +111,24 @@ struct DivDoubleDY { template typename std::enable_if< std::is_same::value>::type -elementwise_div_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy) { +ElementwiseDivGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { int axis = ctx.Attr("axis"); + ElemwiseGradCompute, DivGradDY>( ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// cuda definition template typename std::enable_if< std::is_same::value>::type -elementwise_div_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy); +ElementwiseDivGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy); #endif template @@ -146,15 +144,8 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_div_grad(ctx, x, y, out, dout, dx, dy); - } else { - ElemwiseGradCompute, DivGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), - DivGradDY()); - } + ElementwiseDivGrad(ctx, x, y, out, dout, dx, dy); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 6e53af41b657c..a62c531ff0733 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ 
b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -14,9 +14,12 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/framework/array.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" namespace paddle { namespace operators { @@ -25,58 +28,31 @@ namespace operators { // Add template -struct AddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } -}; +using AddFunctor = pten::funcs::AddFunctor; + template -struct InverseAddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } -}; +using InverseAddFunctor = pten::funcs::InverseAddFunctor; // Subtract template -struct SubFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } -}; +using SubFunctor = pten::funcs::SubtractFunctor; + template -struct InverseSubFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } -}; +using InverseSubFunctor = pten::funcs::InverseSubtractFunctor; // Multiply template -struct MulFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } -}; +using MulFunctor = pten::funcs::MultiplyFunctor; + template -struct InverseMulFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } -}; +using InverseMulFunctor = pten::funcs::InverseMultiplyFunctor; // Divide -#define DIV_ERROR_INFO \ - "InvalidArgumentError: Integer division by zero encountered in " \ - "(floor) divide. Please check the input value." - -template -struct DivFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } -}; - template -struct DivFunctor::value>::type> { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { - // For int32/int64, need to check whether the divison is zero. 
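The integer divide-by-zero check that used to live here has simply moved into `pten::funcs::DivideFunctor`. The `DivGradXYFunctor` / `DivGradXFunctor` / `DivGradYFunctor` and `MulGrad*` functors introduced further below compute both gradients of `z = x / y` (or `x * y`) in a single fused launch; for complex types they conjugate `y` and `z`, following the convention dL/dx = dout * conj(dz/dx). A hedged sketch of the division case with `std::complex` (illustrative only, not the Paddle functors):

    #include <complex>

    // z = x / y:  dx = dout / y,  dy = -dout * z / y  (real case).
    template <typename T>
    void DivGrad(T dout, T y, T z, T* dx, T* dy) {
      *dx = dout / y;
      *dy = -dout * z / y;
    }

    // Complex case: the divisor and the ratio z / y are conjugated, matching
    // the DivGradXFunctor / DivGradYFunctor specializations added below.
    template <typename T>
    void DivGrad(std::complex<T> dout, std::complex<T> y, std::complex<T> z,
                 std::complex<T>* dx, std::complex<T>* dy) {
      *dx = dout / std::conj(y);
      *dy = -dout * std::conj(z / y);
    }
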
- PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); - return a / b; - } -}; +using DivFunctor = pten::funcs::DivideFunctor; -template -struct InverseDivFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } -}; +template +using InverseDivFunctor = pten::funcs::InverseDivideFunctor; // Floor Divide template @@ -113,6 +89,71 @@ struct MinFunctor { } }; +template +using Complex = paddle::platform::complex; + +template +struct DivGradXYFunctor { + inline HOSTDEVICE paddle::framework::Array operator()(const InT a, + const InT b, + const InT c) { + // dx = dout / y + // dy = - dout * out / y + paddle::framework::Array outs; + outs[0] = a / c; + outs[1] = -a * b / c; + return outs; + } +}; + +template +struct DivGradXYFunctor, Complex> { + inline HOSTDEVICE paddle::framework::Array, 2> operator()( + const Complex a, const Complex b, const Complex c) { + paddle::framework::Array, 2> outs; + Complex c_conj(c.real, -c.imag); + Complex out_div_c_conj((b / c).real, -(b / c).imag); + outs[0] = a / c_conj; + outs[1] = -a * out_div_c_conj; + return outs; + } +}; + +// Float div grad +template +struct DivGradXFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } +}; + +// Complex div grad +template +struct DivGradXFunctor> { + inline HOSTDEVICE Complex operator()(const Complex& a, + const Complex& b) const { + Complex b_conj(b.real, -b.imag); + return a / b_conj; + } +}; + +// Float mul and div +template +struct DivGradYFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b, const T& c) const { + return -a * b / c; + } +}; + +// Complex mul and div +template +struct DivGradYFunctor> { + inline HOSTDEVICE Complex operator()(const Complex& a, + const Complex& b, + const Complex& c) const { + Complex out_div_c_conj((b / c).real, -(b / c).imag); + return -a * out_div_c_conj; + } +}; + // Fmax template struct FMaxFunctor { @@ -153,5 +194,47 @@ struct FMinFunctor { } }; +template +struct MulGradFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } +}; +template +struct MulGradFunctor> { + inline HOSTDEVICE Complex operator()(const Complex& a, + const Complex& b) const { + Complex b_conj(b.real, -b.imag); + return a * b_conj; + } +}; + +template +struct MulGradXYFunctor { + inline HOSTDEVICE paddle::framework::Array operator()(const InT& a, + const InT& b, + const InT& c) { + paddle::framework::Array outs; + // dx = dout * y + outs[0] = a * b; + // dy = dout * x + outs[1] = a * c; + return outs; + } +}; + +template +struct MulGradXYFunctor, Complex> { + inline HOSTDEVICE paddle::framework::Array, 2> operator()( + const Complex& a, const Complex& b, const Complex& c) { + paddle::framework::Array, 2> outs; + // dx = dout * y + Complex b_conj(b.real, -b.imag); + outs[0] = a * b_conj; + // dy = dout * x + Complex c_conj(c.real, -c.imag); + outs[1] = a * c_conj; + return outs; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index e131bc4974661..cdf376fd6a8cc 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -57,7 +58,8 @@ class ElementwiseMulKernel auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); - pten::Multiply(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get()); + pten::MultiplyKernel(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, + pt_z.get()); } else { PADDLE_THROW(platform::errors::InvalidArgument( "X's type[%s] is not supported by elementwise_op. X's type should be " @@ -67,69 +69,41 @@ class ElementwiseMulKernel } }; -template -static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y, - const T* out, - const T* dout, - int64_t size, T* dx, - T* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; - - while (col < size) { - T o = dout[col]; - dx[col] = y[col] * o; - dy[col] = x[col] * o; - col += blockDim.x * gridDim.x; - } -} - -template <> -__global__ void SimpleElemwiseMulGradCUDAKernel>( - const plat::complex* x, const plat::complex* y, - const plat::complex* out, const plat::complex* dout, - int64_t size, plat::complex* dx, plat::complex* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; - - while (col < size) { - plat::complex o = dout[col]; - dx[col] = plat::complex(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex(x[col].real, -x[col].imag) * o; - col += blockDim.x * gridDim.x; - } -} - -template <> -__global__ void SimpleElemwiseMulGradCUDAKernel>( - const plat::complex* x, const plat::complex* y, - const plat::complex* out, const plat::complex* dout, - int64_t size, plat::complex* dx, plat::complex* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; - - while (col < size) { - plat::complex o = dout[col]; - dx[col] = plat::complex(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex(x[col].real, -x[col].imag) * o; - col += blockDim.x * gridDim.x; - } -} - template typename std::enable_if< - std::is_same::value>::type -elementwise_mul_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy) { - dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1); - auto size = x->numel(); - dim3 grid_size = - dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1); - SimpleElemwiseMulGradCUDAKernel< - T><<().stream()>>>( - x->data(), y->data(), out->data(), dout->data(), size, - dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); + std::is_same::value>::type +ElementwiseMulGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + const auto& dev_ctx = + ctx.template device_context(); + const auto place = ctx.GetPlace(); + + if (dx != nullptr && dy != nullptr) { + dx->mutable_data(place); + if (dx->IsSharedBufferWith(*dout)) { + dx->clear(); + dx->mutable_data(x->dims(), place); + } + std::vector ins = {dout, y, x}; + GetGradXAndYOut( + dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + 
dx->mutable_data(place); + if (dx->IsSharedBufferWith(*dout)) { + dx->clear(); + dx->mutable_data(x->dims(), place); + } + std::vector ins = {dout, y}; + GetGradXOrYOut(dev_ctx, place, axis, ins, dout, + dx, MulGradFunctor()); + } else if (dx == nullptr && dy != nullptr) { + std::vector ins = {dout, x}; + GetGradXOrYOut(dev_ctx, place, axis, ins, dout, + dy, MulGradFunctor()); + } } } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 8b43f82e6b6a1..5cff3173e8115 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -24,7 +24,7 @@ limitations under the License. */ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { @@ -129,7 +129,8 @@ class ElementwiseMulKernel : public framework::OpKernel { auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); - pten::Multiply(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get()); + pten::MultiplyKernel(dev_ctx, *pt_x.get(), *pt_y.get(), axis, + pt_z.get()); } else { PADDLE_THROW(platform::errors::InvalidArgument( "X's type[%s] is not supported by elementwise_op. X's type should be " @@ -173,26 +174,23 @@ struct MulGradDY> { template typename std::enable_if< std::is_same::value>::type -elementwise_mul_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy) { +ElementwiseMulGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { int axis = ctx.Attr("axis"); ElemwiseGradCompute, MulGradDY>( ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// cuda definition template typename std::enable_if< std::is_same::value>::type -elementwise_mul_grad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, - const framework::Tensor* dout, framework::Tensor* dx, - framework::Tensor* dy); +ElementwiseMulGrad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + const framework::Tensor* out, const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy); #endif template @@ -208,14 +206,8 @@ class ElementwiseMulGradKernel : public ElemwiseGradKernel { auto* out = dout; // out is not necessary auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_mul_grad(ctx, x, y, out, dout, dx, dy); - } else { - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - } + + ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h 
b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 30aba42aeee11..25c983566b371 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -22,147 +22,8 @@ namespace operators { namespace kps = paddle::operators::kernel_primitives; -struct DimensionsTransform { - using DimVector = std::vector; - typedef void (*MergeFunctor)(bool &, std::vector &, DimVector &, - int, int); - int64_t dim_size; - DimVector out_dims; - std::vector in_dims; - - private: - // To compensate the lackage of input_tensors` dimension with input variable - // 'axis' - void InputDimensionsExtend(int N, int axis) { - for (auto &in_dim : in_dims) { - int64_t in_idx = 0; - if (in_dim.size() < dim_size) { - DimVector tmp_dim(dim_size, 1); - do { - if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { - tmp_dim[axis] = in_dim[in_idx]; - in_idx++; - axis++; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", - in_idx + 1, axis + 1, out_dims[axis], in_dim[in_idx])); - } - } while (in_idx < in_dim.size()); - in_dim.resize(dim_size); - std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); - } else { - do { - if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { - in_idx++; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", - in_idx + 1, in_idx + 1, out_dims[in_idx], in_dim[in_idx])); - } - } while (in_idx < dim_size); - } - std::reverse(in_dim.begin(), in_dim.end()); - } - std::reverse(out_dims.begin(), out_dims.end()); - } - - template - __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { - auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { - (*vec)[m_idx - 1] = - std::accumulate(vec->begin() + l_idx, vec->begin() + m_idx, 1, - std::multiplies()); - vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); - }; - - int64_t i = 0; - while (i < dim_size) { - int cnt = 0; - int low_idx = i; - bool equal = true; - do { - merge_func(equal, in_dims, out_dims, i, N); - if (equal) { - i++; - cnt++; - } else { - break; - } - } while (i < dim_size); - - if (cnt > 1) { - for (auto &in_dim : in_dims) { - VectorReorganise(&in_dim, low_idx, i); - } - VectorReorganise(&out_dims, low_idx, i); - dim_size -= --cnt; - i -= cnt; - } else if (cnt < 1) { - i++; - } - } - } - - public: - explicit DimensionsTransform( - const std::vector &ins, - const framework::DDim &dims, int axis) { - const int N = ins.size(); - dim_size = dims.size(); - out_dims = framework::vectorize(dims); - in_dims.resize(N); - for (int j = 0; j < N; ++j) { - in_dims[j] = framework::vectorize(ins[j]->dims()); - } - InputDimensionsExtend(N, axis); - - auto merge_sequential_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, int i, int num) { - for (int j = 1; j < num; ++j) { - equal &= (in_dims[0][i] == in_dims[j][i]) ? true : false; - } - }; - auto merge_sequential_one_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, int i, int num) { - equal = in_dims[0][i] == 1; - if (equal) { - for (int j = 1; j < num; ++j) { - equal &= in_dims[j][i] == out[i]; - } - } - }; - // To Merge the dimensions of input_tensors while the consequtive - // equal-dimensions appears. 
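`DimensionsTransform` is removed here only because the equivalent logic now lives in pten's broadcast implementation; the deleted comments still describe the trick: runs of adjacent axes on which every input has the same broadcast pattern can be collapsed into one axis, which shrinks the per-element index arithmetic in the kernel. A simplified sketch of that merge (hypothetical helper; only the output shape is merged here):

    #include <cstdint>
    #include <vector>

    // Merge adjacent axes whose broadcast pattern (which inputs have size 1
    // on that axis) is identical; each merged axis size is the product of the run.
    std::vector<int64_t> MergeBroadcastAxes(
        const std::vector<std::vector<int64_t>>& in_dims,  // already rank-padded
        const std::vector<int64_t>& out_dims) {
      auto pattern = [&](size_t axis) {
        std::vector<bool> p;
        for (const auto& in : in_dims) p.push_back(in[axis] == 1);
        return p;
      };
      std::vector<int64_t> merged;
      size_t i = 0;
      while (i < out_dims.size()) {
        int64_t prod = out_dims[i];
        size_t j = i + 1;
        while (j < out_dims.size() && pattern(j) == pattern(i)) prod *= out_dims[j++];
        merged.push_back(prod);
        i = j;
      }
      return merged;
    }

For example, with inputs of shape {2, 3, 1, 1} and {2, 3, 4, 5}, the first two axes merge into 6 and the last two into 20, so the kernel only has to handle a 2-D broadcast.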
- MergeFunctor merge_ptr = merge_sequential_dims; - MergeDimensions(merge_ptr, N); - - int min_idx = 0; - int min_val = std::accumulate(in_dims[0].begin(), in_dims[0].end(), 1, - std::multiplies()); - for (int j = 1; j < N; ++j) { - int temp = std::accumulate(in_dims[j].begin(), in_dims[j].end(), 1, - std::multiplies()); - min_val = min_val > temp ? temp : min_val; - min_idx = min_val == temp ? j : min_idx; - } - std::swap(in_dims[0], in_dims[min_idx]); - - // To Merge the dimension of input_tensors while the consequtive - // 1-value-dimensions appears. - merge_ptr = merge_sequential_one_dims; - MergeDimensions(merge_ptr, N); - std::swap(in_dims[min_idx], in_dims[0]); - } -}; - -template +template void LaunchBroadcastElementwiseCudaKernel( const platform::CUDADeviceContext &ctx, const std::vector &ins, @@ -190,11 +51,12 @@ void LaunchBroadcastElementwiseCudaKernel( for (int i = 0; i < pt_outputs_tmp.size(); i++) { pt_outputs.push_back(pt_outputs_tmp[i].get()); } - pten::LaunchBroadcastElementwiseCudaKernel( + pten::LaunchBroadcastElementwiseCudaKernel( ctx, pt_inputs, &pt_outputs, axis, func); } -template +template void LaunchElementwiseCudaKernel( const platform::CUDADeviceContext &cuda_ctx, const std::vector &ins, @@ -222,8 +84,8 @@ void LaunchElementwiseCudaKernel( for (int i = 0; i < pt_outputs_tmp.size(); i++) { pt_outputs.push_back(pt_outputs_tmp[i].get()); } - pten::LaunchElementwiseCudaKernel(cuda_ctx, pt_inputs, - &pt_outputs, axis, func); + pten::LaunchElementwiseCudaKernel( + cuda_ctx, pt_inputs, &pt_outputs, axis, func); } } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 9700ca3584de8..3929699955a17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,8 +31,7 @@ limitations under the License. */ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/kernels/hybird/cpu/elementwise.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/cpu/elementwise.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -43,6 +42,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -151,9 +151,9 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, int *x_dims_array, int *y_dims_array, int *out_dims_array, const int max_dim, const int axis) { - pten::general::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array, - y_dims_array, out_dims_array, max_dim, - axis); + pten::funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array, + y_dims_array, out_dims_array, max_dim, + axis); } template @@ -1073,71 +1073,9 @@ void CommonGradBroadcastCUDA( inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return pten::general::trim_trailing_singular_dims(dims); + return pten::funcs::trim_trailing_singular_dims(dims); } -template -class TransformFunctor { - public: - TransformFunctor(const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z, const DeviceContext &ctx, Functor func, - const bool is_xsize_larger = true) - : x_(x->data()), - y_(y->data()), - z_(z->mutable_data(ctx.GetPlace())), - nx_(x->numel()), - ctx_(ctx), - func_(func), - is_xsize_larger_(is_xsize_larger) { - if (is_xsize_larger_ == false) { - nx_ = y->numel(); - } - } - - inline void Run() const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, y_, z_, func_); - } - - inline void RunRowWise(int n, int pre) const { - platform::Transform trans; - if (is_xsize_larger_) { - trans(ctx_, x_, x_ + nx_, - pten::general::RowwiseTransformIterator(y_, n), - z_, func_); - } else { - trans(ctx_, y_, y_ + nx_, - pten::general::RowwiseTransformIterator(x_, n), - z_, func_); - } - } - - inline void RunMidWise(int n, int pre, int post) const { - platform::Transform trans; - if (is_xsize_larger_) { - trans(ctx_, x_, x_ + nx_, - pten::general::MidWiseTransformIterator(y_, n, - post), - z_, func_); - } else { - trans(ctx_, y_, y_ + nx_, - pten::general::MidWiseTransformIterator(x_, n, - post), - z_, func_); - } - } - - private: - const T *x_; - const T *y_; - OutType *z_; - int64_t nx_; - const DeviceContext &ctx_; - Functor func_; - bool is_xsize_larger_; -}; - template struct ElemwiseGradNoBroadcast { const T *x_; @@ -1457,13 +1395,13 @@ void ElemwiseGradComputeWithBroadcast( if (is_xsize_larger) { auto y_dims_trimed = trim_trailing_singular_dims(y_dims); axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - pten::general::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, - &post, &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); } else { auto x_dims_trimed = trim_trailing_singular_dims(x_dims); axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - pten::general::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, - &post, &is_run_common_broadcast); + pten::funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); } // special case for common backward implementation. if (is_run_common_broadcast) { @@ -1861,8 +1799,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -2409,8 +2347,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); @@ -2619,5 +2557,77 @@ static inline std::vector GetReduceDim(const framework::DDim &in, } return dims; } + +#if defined(__NVCC__) || defined(__HIPCC__) +template +void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, + framework::Tensor *src, framework::Tensor *dst) { + std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); + TensorReduceFunctorImpl>( + *src, dst, kps::IdentityFunctor(), reduce_dims, dev_ctx.stream()); +} + +template +void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, + const platform::Place &place, int axis, + std::vector ins, + const framework::Tensor *dout, framework::Tensor *dx, + framework::Tensor *dy, Functor func) { + framework::Tensor tmp_dx; + framework::Tensor tmp_dy; + dy->mutable_data(place); + std::vector outs; + if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) { + outs = {dx, dy}; + } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { + tmp_dx.mutable_data(dout->dims(), place); + outs = {&tmp_dx, dy}; + } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { + tmp_dy.mutable_data(dout->dims(), place); + outs = {dx, &tmp_dy}; + } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { + tmp_dy.mutable_data(dout->dims(), place); + tmp_dx.mutable_data(dout->dims(), place); + outs = {&tmp_dx, &tmp_dy}; + } + + LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, + axis, func); + + if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); + } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); + } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); + ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); + } +} + +template +void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, + const platform::Place &place, int axis, + std::vector ins, + const framework::Tensor *dout, framework::Tensor *dxy, + Functor func) { + framework::Tensor tmp_dxy; + dxy->mutable_data(place); + + std::vector outs; + if (dxy->dims() != dout->dims()) { + tmp_dxy.mutable_data(dout->dims(), place); + outs = {&tmp_dxy}; + } else { + outs = {dxy}; + } + + LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, axis, func); + if (dxy->dims() != dout->dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); + } +} + +#endif + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 12fdcd40aa0b1..1d8acd5eca5d9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -23,13 +23,7 @@ 
limitations under the License. */ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" - -#ifdef __HIPCC__ -#define ELEMENTWISE_BLOCK_SIZE 256 -#else -#define ELEMENTWISE_BLOCK_SIZE 512 -#endif +#include "paddle/pten/kernels/gpu/elementwise.h" namespace paddle { namespace operators { @@ -38,7 +32,8 @@ namespace kps = paddle::operators::kernel_primitives; using ElementwiseType = pten::ElementwiseType; -template +template void LaunchSameDimsElementwiseCudaKernel( const platform::CUDADeviceContext &ctx, const std::vector &ins, @@ -66,8 +61,8 @@ void LaunchSameDimsElementwiseCudaKernel( for (int i = 0; i < pt_outputs_tmp.size(); i++) { pt_outputs.push_back(pt_outputs_tmp[i].get()); } - pten::LaunchSameDimsElementwiseCudaKernel(ctx, pt_inputs, - &pt_outputs, func); + pten::LaunchSameDimsElementwiseCudaKernel( + ctx, pt_inputs, &pt_outputs, func); } } // namespace operators diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 09a33788d4133..6a51d7c2a45ad 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -22,7 +22,7 @@ limitations under the License. */ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { @@ -56,7 +56,8 @@ class ElementwiseSubKernel : public framework::OpKernel { auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - pten::Subtract(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get()); + pten::SubtractKernel(dev_ctx, *pt_x.get(), *pt_y.get(), axis, + pt_z.get()); } }; diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index 3d28ca90a5a15..7178097156017 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -109,6 +109,20 @@ class EmptyOp : public framework::OperatorWithKernel { framework::proto::VarType::Type(context.Attr("dtype")), context.GetPlace()); } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext& ctx) const override { + std::string shape; + if (ctx.HasInput("ShapeTensor")) { + shape = "ShapeTensor"; + } else if (ctx.MultiInput("ShapeTensorList").size()) { + shape = "ShapeTensorList"; + } else { + shape = "shape"; + } + + return framework::KernelSignature("empty", {}, {shape}, {"Out"}); + } }; class EmptyOpVarTypeInference : public framework::VarTypeInference { diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc new file mode 100644 index 0000000000000..56a6a80b45dff --- /dev/null +++ b/paddle/fluid/operators/erfinv_op.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
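/* A hedged, standalone illustration (not Paddle API) of the shape-argument
 * priority encoded by EmptyOp::GetExpectedPtenKernelArgs above: a runtime
 * ShapeTensor input wins over a ShapeTensorList, which wins over the static
 * `shape` attribute. The helper name below is made up for illustration. */
#include <cstdio>
#include <string>

std::string ResolveShapeArg(bool has_shape_tensor, bool has_shape_tensor_list) {
  if (has_shape_tensor) return "ShapeTensor";
  if (has_shape_tensor_list) return "ShapeTensorList";
  return "shape";
}

int main() {
  std::printf("%s\n", ResolveShapeArg(false, true).c_str());  // ShapeTensorList
  return 0;
}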
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/erfinv_op.h" + +namespace paddle { +namespace operators { + +class ErfinvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "erfinv"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "erfinv"); + + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ErfinvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of erfinv op."); + AddOutput("Out", "(Tensor), The output tensor of erfinv op."); + AddComment(R"DOC( +Erfinv Operator. + +This operator is used to compute inverse error function of input $X$. + +The equation is: + +$$erfinv(x) = {ndtri({x \over 2} + 0.5)} \over {\sqrt{2}}$$ + +The input `X` can carry the LoD (Level of Details) information, +or not. And the output shares the LoD information with input `X`. +)DOC"); + } +}; + +class ErfinvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } +}; + +template +class ErfinvGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr op) const override { + op->SetType("erfinv_grad"); + op->SetInput("Out", this->Output("Out")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR( + erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker, + paddle::operators::ErfinvGradMaker, + paddle::operators::ErfinvGradMaker, + paddle::operators::ErfinvInplaceInferer); + +REGISTER_OPERATOR(erfinv_grad, paddle::operators::ErfinvGradOp); + +REGISTER_OP_CPU_KERNEL( + erfinv, + paddle::operators::ErfinvKernel, + paddle::operators::ErfinvKernel); + +REGISTER_OP_CPU_KERNEL( + erfinv_grad, + paddle::operators::ErfinvGradKernel, + paddle::operators::ErfinvGradKernel); diff --git a/paddle/fluid/operators/erfinv_op.cu b/paddle/fluid/operators/erfinv_op.cu new file mode 100644 index 0000000000000..1fb2dbb97a2df --- /dev/null +++ b/paddle/fluid/operators/erfinv_op.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/erfinv_op.h" + +REGISTER_OP_CUDA_KERNEL( + erfinv, + paddle::operators::ErfinvKernel, + paddle::operators::ErfinvKernel); + +REGISTER_OP_CUDA_KERNEL( + erfinv_grad, + paddle::operators::ErfinvGradKernel, + paddle::operators::ErfinvGradKernel); diff --git a/paddle/fluid/operators/erfinv_op.h b/paddle/fluid/operators/erfinv_op.h new file mode 100644 index 0000000000000..934d0f4a5a715 --- /dev/null +++ b/paddle/fluid/operators/erfinv_op.h @@ -0,0 +1,65 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +// ndtri(x * 0.5 + 0.5) / sqrt(2) +template +class ErfinvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto eigen_in = framework::EigenVector::Flatten(*in); + auto eigen_out = framework::EigenVector::Flatten(*out); + auto& place = *ctx.template device_context().eigen_device(); + constexpr T half = static_cast(0.5); + constexpr T half_sqrt = static_cast(M_SQRT1_2); + eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; + } +}; + +// sqrt(pi) / 2 * exp(square(out)) * grad +template +class ErfinvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto out = ctx.Input("Out"); + auto dout = ctx.Input(framework::GradVarName("Out")); + auto dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_dout = framework::EigenVector::Flatten(*dout); + auto eigen_dx = framework::EigenVector::Flatten(*dx); + auto& place = *ctx.template device_context().eigen_device(); + + constexpr T half_sqrt_pi = static_cast(1 / M_2_SQRTPI); + eigen_dx.device(place) = + half_sqrt_pi * eigen_dout * eigen_out.square().exp(); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc old mode 100644 new mode 100755 index 5296a144f6247..cc293a5aaa0b2 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -12,6 +12,7 @@ limitations under the License. 
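/* Why the erfinv kernels above use these factors, plus a numerical check
 * (a standalone sketch, not Paddle code). Forward: Phi(z) = (1 + erf(z/sqrt(2)))/2
 * gives erfinv(x) = ndtri(x/2 + 0.5) / sqrt(2), which is the Eigen expression in
 * ErfinvKernel. Backward: differentiating erf(y) = x with
 * erf'(y) = 2/sqrt(pi) * exp(-y^2) gives dy/dx = sqrt(pi)/2 * exp(y^2), the factor
 * in ErfinvGradKernel. ErfinvNewton below is a hypothetical helper that inverts
 * std::erf, since erfinv is not in the C++ standard library. */
#include <cmath>
#include <cstdio>

static double ErfinvNewton(double x) {
  const double kPi = std::acos(-1.0);
  double y = 0.0;  // Newton iteration: y <- y - (erf(y) - x) / erf'(y)
  for (int i = 0; i < 60; ++i) {
    y -= (std::erf(y) - x) / (2.0 / std::sqrt(kPi) * std::exp(-y * y));
  }
  return y;
}

int main() {
  const double kPi = std::acos(-1.0);
  const double x = 0.3, h = 1e-6;
  const double numeric = (ErfinvNewton(x + h) - ErfinvNewton(x - h)) / (2 * h);
  const double y = ErfinvNewton(x);
  const double analytic = std::sqrt(kPi) / 2.0 * std::exp(y * y);
  std::printf("numeric=%.8f analytic=%.8f\n", numeric, analytic);  // ~equal
  return 0;
}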
*/ #include "paddle/fluid/operators/expand_as_v2_op.h" #include #include +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -50,6 +51,10 @@ class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor, default Tensor). A tensor with rank in [1, 6]." "X is the input to be expanded."); + AddInput("Y", + "(Tensor, default Tensor). A tensor with rank in [1, 6]." + "Expand X according to the shape of Y.") + .AsDispensable(); AddOutput("Out", "(Tensor, default Tensor). A tensor with rank in [1, 6]." "The rank of Output(Out) have the same with Input(X). " @@ -144,3 +149,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ExpandAsV2GradKernel, ops::ExpandAsV2GradKernel); #endif + +REGISTER_OP_VERSION(expand_as_v2) + .AddCheckpoint( + R"ROC(fix expand_as_v2 and add new input [Y])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "Y", "Expand X according to the shape of Y")); \ No newline at end of file diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h old mode 100644 new mode 100755 index 3e8f7d15880bc..9e683a792c61f --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -91,17 +91,34 @@ class ExpandAsV2Kernel : public framework::OpKernel { PADDLE_ENFORCE_NE(target_shape[i], 0, platform::errors::InvalidArgument( "The value of target shape cannot be zero.")); - if (vec_in_dims[i] != 1) { + if (i < diff) { + PADDLE_ENFORCE_GT( + target_shape[i], 0, + platform::errors::InvalidArgument( + "The expanded size (%d) for non-existing dimensions must be " + "positive for expand_as_v2 op.", + target_shape[i])); + repeat_times[i] = target_shape[i]; + } else if (target_shape[i] > 0) { + if (vec_in_dims[i] != 1) { + PADDLE_ENFORCE_EQ( + vec_in_dims[i], target_shape[i], + platform::errors::InvalidArgument( + "The value (%d) of the non-singleton dimension does not match" + " the corresponding value (%d) in shape for expand_as_v2 op.", + vec_in_dims[i], target_shape[i])); + repeat_times[i] = 1; + } else { + repeat_times[i] = target_shape[i]; + } + } else { PADDLE_ENFORCE_EQ( - vec_in_dims[i], target_shape[i], + target_shape[i], -1, platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in " - "target tensor for expand_as_v2 op.", - vec_in_dims[i], target_shape[i])); + "When the value in shape is negative for expand_as_v2 op, " + "only -1 is supported, but the value received is %d.", + target_shape[i])); repeat_times[i] = 1; - } else { - repeat_times[i] = target_shape[i]; } } auto* out0 = context.Output("Out"); diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc old mode 100644 new mode 100755 index dc6da979671e5..6d803c500d90f --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -65,7 +65,11 @@ class ExpandV2Op : public framework::OperatorWithKernel { if (x_dims[i] == -1) { out_shape[i] = -1; } else if (expand_shape[i] == -1) { - out_shape[i] = x_dims[i]; + if (static_cast(x_dims.size()) > i) { + out_shape[i] = x_dims[i]; + } else { + out_shape[i] = -1; + } } else if (expand_shape[i] == -2) { // We use -2 to represent the element in expand_shape is a var. 
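/* A worked, standalone sketch (illustrative names only) of the repeat_times
 * logic in ExpandAsV2Kernel above: target dims the input lacks must be positive
 * and are taken as-is; for existing dims, a size-1 input dim is tiled to the
 * target size, a matching dim is kept (repeat 1), and -1 means "keep the input
 * size". The equality check for non-singleton dims is enforced by the op and
 * omitted here. */
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int> RepeatTimes(std::vector<int64_t> in,
                             const std::vector<int64_t>& target) {
  const std::size_t diff = target.size() - in.size();  // assumes target rank >= input rank
  in.insert(in.begin(), diff, 1);                      // left-pad input dims with 1s
  std::vector<int> repeat(target.size(), 1);
  for (std::size_t i = 0; i < target.size(); ++i) {
    if (i < diff) {
      repeat[i] = static_cast<int>(target[i]);  // newly created leading dim
    } else if (target[i] > 0) {
      repeat[i] = (in[i] == 1) ? static_cast<int>(target[i]) : 1;
    } else {          // only -1 is allowed here
      repeat[i] = 1;  // keep the input size
    }
  }
  return repeat;
}

int main() {
  // in = {3, 1}, target = {2, 3, 4}  ->  repeat_times = {2, 1, 4}
  for (int r : RepeatTimes({3, 1}, {2, 3, 4})) std::printf("%d ", r);
  std::printf("\n");
  return 0;
}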
out_shape[i] = -1; diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc new file mode 100644 index 0000000000000..ee456dcdafbc5 --- /dev/null +++ b/paddle/fluid/operators/exponential_op.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/exponential_op.h" + +namespace paddle { +namespace operators { + +class ExponentialOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExponentialOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExponentialOp"); + auto dim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class ExponentialOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddComment(R"DOC( +This operator fills the input tensor with random values sampled from a +exponential distribution. +)DOC"); + AddInput("X", "The input tensor."); + AddOutput("Out", "The output tensor of exponential OP."); + AddAttr( + "lambda", "lambd parameter of exponential distribution. 
[default 1.0].") + .SetDefault(1.0f); + } +}; + +class ExponentialOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map &GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Out"}}; + return m; + } +}; + +template +class ExponentialKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *out = ctx.Output("Out"); + T *out_data = out->mutable_data(ctx.GetPlace()); + + T lambda = static_cast(ctx.Attr("lambda")); + int64_t size = out->numel(); + + auto gen = framework::DefaultCPUGenerator(); + auto engine = gen->GetCPUEngine(); + + std::uniform_real_distribution uniform(0.0, 1.0); + distribution::exponential_transform trans(lambda); + for (int64_t i = 0; i < size; ++i) { + out_data[i] = trans(uniform(*engine)); + } + } +}; + +class ExponentialGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out_Grad", "ExponentialGradOp"); + + auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dim); + } +}; + +template +class ExponentialGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("exponential_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +DECLARE_INPLACE_OP_INFERER(ExponentialInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(ExponentialGradInferer, + {paddle::framework::GradVarName("Out"), + paddle::framework::GradVarName("X")}); + +REGISTER_OPERATOR(exponential, ops::ExponentialOp, ops::ExponentialOpMaker, + ops::ExponentialOpInferVarType, + ops::ExponentialGradOpMaker, + ops::ExponentialGradOpMaker, + ExponentialInferer); +REGISTER_OPERATOR(exponential_grad, ops::ExponentialGradOp, + ExponentialGradInferer); + +REGISTER_OP_CPU_KERNEL(exponential, + ops::ExponentialKernel, + ops::ExponentialKernel); +REGISTER_OP_CPU_KERNEL( + exponential_grad, ops::ExponentialGradKernel, + ops::ExponentialGradKernel); diff --git a/paddle/fluid/operators/exponential_op.cu b/paddle/fluid/operators/exponential_op.cu new file mode 100644 index 0000000000000..8b989501e4f42 --- /dev/null +++ b/paddle/fluid/operators/exponential_op.cu @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/exponential_op.h" + +namespace paddle { +namespace operators { + +template +class ExponentialKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + framework::Tensor* out = ctx.Output("Out"); + auto& dev_cxt = ctx.template device_context(); + T lambda = static_cast(ctx.Attr("lambda")); + + distribution::uniform_distribution dist; + distribution::exponential_transform trans(lambda); + distribution::distribution_and_transform(dev_cxt, out, dist, trans); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + exponential, ops::ExponentialKernel, + ops::ExponentialKernel); +REGISTER_OP_CUDA_KERNEL( + exponential_grad, + ops::ExponentialGradKernel, + ops::ExponentialGradKernel); diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h new file mode 100644 index 0000000000000..d8cafb8ef7f02 --- /dev/null +++ b/paddle/fluid/operators/exponential_op.h @@ -0,0 +1,42 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/distribution_helper.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class ExponentialKernel; + +template +class ExponentialGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + math::SetConstant functor; + auto& dev_ctx = ctx.template device_context(); + functor(dev_ctx, dx, static_cast(0)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h index 3ad56827f8344..287bbbfa3b343 100644 --- a/paddle/fluid/operators/fill_any_like_op.h +++ b/paddle/fluid/operators/fill_any_like_op.h @@ -20,7 +20,7 @@ limitations under the License. 
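/* A standalone sketch of the sampling scheme the exponential kernels above rely
 * on (inverse-CDF transform; names here are illustrative, and the exact
 * transform in distribution_helper.h may use u instead of 1 - u, which is
 * equidistributed): if U ~ Uniform(0, 1), then X = -log(1 - U) / lambda is
 * exponentially distributed with rate lambda, so the sample mean approaches
 * 1 / lambda. */
#include <cmath>
#include <cstdio>
#include <random>

int main() {
  const double lambda = 1.5;
  std::mt19937_64 engine(42);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  double sum = 0.0;
  const int n = 100000;
  for (int i = 0; i < n; ++i) {
    sum += -std::log1p(-uniform(engine)) / lambda;  // one exponential sample
  }
  std::printf("sample mean ~= %.4f, 1/lambda = %.4f\n", sum / n, 1.0 / lambda);
  return 0;
}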
*/ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/creation.h" +#include "paddle/pten/kernels/full_kernel.h" namespace paddle { namespace operators { @@ -65,7 +65,7 @@ class FillAnyLikeKernel : public framework::OpKernel { const auto& dev_ctx = context.template device_context(); // call new kernel - pten::FullLike(dev_ctx, value, pt_out.get()); + pten::FullLikeKernel(dev_ctx, value, pt_out.get()); } }; diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index fd0f42df11875..fa0cab04168d1 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -31,13 +31,9 @@ namespace operators { using Tensor = framework::Tensor; using SelectedRows = framework::SelectedRows; using LoDTensor = framework::LoDTensor; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + template using Vector = framework::Vector; -#else -template -using Vector = framework::CPUVector; -#endif template class FilterByInstagKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index a1b8dd6bae494..6b1ee00b55d62 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -431,6 +431,12 @@ class FlattenContiguousRangeGradOp : public framework::OperatorWithKernel { ctx, framework::GradVarName("Out")), ctx.device_context()); } + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + return framework::KernelSignature("flatten_grad", + {framework::GradVarName("Out"), "XShape"}, + {}, {framework::GradVarName("X")}); + } }; DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceInferer, diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 7d08a95821138..ef42619bfe4ff 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -21,7 +21,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/empty_kernel.h" +#include "paddle/pten/kernels/flatten_grad_kernel.h" +#include "paddle/pten/kernels/flatten_kernel.h" namespace paddle { namespace operators { @@ -134,7 +136,8 @@ class FlattenContiguousRangeKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::Flatten(dev_ctx, *pt_x.get(), start_axis, stop_axis, pt_out.get()); + pten::FlattenKernel(dev_ctx, *pt_x.get(), start_axis, + stop_axis, pt_out.get()); } }; @@ -145,15 +148,25 @@ class FlattenContiguousRangeGradKernel : public framework::OpKernel { auto *d_x = ctx.Output(framework::GradVarName("X")); auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto *xshape = ctx.Input("XShape"); d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, ctx.GetPlace(), - ctx.template device_context(), d_x); - d_x->Resize(x_dims); + auto &dev_ctx = ctx.device_context(); + + auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); + auto pt_d_out = paddle::experimental::MakePtenDenseTensor(*d_out); + + // Because the holder of xshape may be nullptr, we can't use + // MakePtenDenseTensor. + // So, we create a new DenseTensor to save the dims of xshape. + pten::DenseTensorMeta xshape_meta{pten::TransToPtenDataType(d_x->type()), + xshape->dims(), d_x->layout()}; + auto pt_xshape = + pten::Empty(dev_ctx, std::move(xshape_meta)); + + // call new kernel + pten::FlattenGradKernel(dev_ctx, *pt_d_out.get(), + pt_xshape, pt_d_x.get()); } }; diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc new file mode 100644 index 0000000000000..5fd9c70c04e8b --- /dev/null +++ b/paddle/fluid/operators/fold_op.cc @@ -0,0 +1,274 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
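/* A hedged note on the XShape trick used by FlattenContiguousRangeGradKernel
 * above (standalone sketch, illustrative helper name): XShape's dims are the
 * original input dims with a leading placeholder entry (the removed code sliced
 * it off with slice_ddim(xshape_dims, 1, ...)), so the gradient is just d_out
 * resized to those dims. Only the dims are needed, which is why an empty
 * DenseTensor carrying xshape->dims() is enough for the new pten grad kernel. */
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> GradDimsFromXShape(const std::vector<int64_t>& xshape) {
  // xshape = {0, d0, d1, ...}  ->  dX dims = {d0, d1, ...}
  return std::vector<int64_t>(xshape.begin() + 1, xshape.end());
}

int main() {
  for (int64_t d : GradDimsFromXShape({0, 2, 3, 4})) {
    std::printf("%lld ", static_cast<long long>(d));  // 2 3 4
  }
  std::printf("\n");
  return 0;
}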
*/ + +#include "paddle/fluid/operators/fold_op.h" +#include "paddle/fluid/operators/unfold_op.h" + +namespace paddle { +namespace operators { + +class FoldOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::NotFound("Input(X) of FoldOp should not be null")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Y"), true, + platform::errors::NotFound("Output(Y) of FoldOp should not be null")); + auto in_dims = ctx->GetInputDim("X"); + std::vector output_sizes = + ctx->Attrs().Get>("output_sizes"); + std::vector kernel_sizes = + ctx->Attrs().Get>("kernel_sizes"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + + PADDLE_ENFORCE_EQ( + output_sizes.size(), 2, + platform::errors::InvalidArgument( + "It is expected output_size equals to 2, but got size %d", + output_sizes.size())); + PADDLE_ENFORCE_EQ( + kernel_sizes.size(), 2, + platform::errors::InvalidArgument( + "It is expected kernel_size equals to 2, but got size %d", + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + strides.size(), 2, + platform::errors::InvalidArgument( + "It is expected strides_size equals to 2, but got size %d", + strides.size())); + PADDLE_ENFORCE_EQ( + paddings.size(), 4, + platform::errors::InvalidArgument( + "It is expected paddings_size equals to 4, but got size %d", + paddings.size())); + PADDLE_ENFORCE_EQ( + dilations.size(), 2, + platform::errors::InvalidArgument( + "It is expected dilations_size equals to 2, but got size %d", + dilations.size())); + + int output_height = output_sizes[0]; + int output_width = output_sizes[1]; + int kernel_height = kernel_sizes[0]; + int kernel_width = kernel_sizes[1]; + int dilation_height = dilations[0]; + int dilation_width = dilations[1]; + int stride_height = strides[0]; + int stride_width = strides[1]; + + // check kernel_sizes + PADDLE_ENFORCE_GT(kernel_height, 0, + platform::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], kernel_sizes[1])); + PADDLE_ENFORCE_GT(kernel_width, 0, + platform::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], kernel_sizes[1])); + // check strides + PADDLE_ENFORCE_GT(stride_height, 0, + platform::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], strides[1])); + PADDLE_ENFORCE_GT(stride_width, 0, + platform::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], strides[1])); + // check dilations + PADDLE_ENFORCE_GT( + dilation_height, 0, + platform::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], dilations[1])); + PADDLE_ENFORCE_GT( + dilation_width, 0, + platform::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], dilations[1])); + + std::vector out_dims; + // batch_size + out_dims.push_back(in_dims[0]); + // output_plane + int output_channels = in_dims[1] / 
(kernel_width * kernel_height); + out_dims.push_back(output_channels); + + int blocks_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + int blocks_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + + 1; + + // check output height and width + PADDLE_ENFORCE_GT( + blocks_height, 0, + platform::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "is (%d, %d), which should be a positive integer.", + in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], + strides[0], strides[1], dilations[0], dilations[1], output_height, + output_width)); + + PADDLE_ENFORCE_GT( + blocks_width, 0, + platform::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "is (%d, %d), which should be a positive integer.", + in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], + strides[0], strides[1], dilations[0], dilations[1], output_height, + output_width)); + + PADDLE_ENFORCE_EQ( + blocks_height * blocks_width, in_dims[1], + platform::errors::InvalidArgument( + "Given input output_size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "which should be expected size of input's dimension " + "2 to match the calculated number of %d * %d = %d, but got %d", + output_height, output_width, kernel_sizes[0], kernel_sizes[1], + strides[0], strides[1], dilations[0], dilations[1], blocks_height, + blocks_width, blocks_height * blocks_width, in_dims[2])); + + out_dims.push_back(output_height); + out_dims.push_back(output_width); + ctx->SetOutputDim("Y", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } +}; + +class FoldOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "Tensor, " + "the input of fold op. " + "The format of X is [N, C_in, L], " + "where N is the batch size, C_in is the input channels, " + "L is the length"); + AddOutput("Y", + "Tensor, " + "the output of unfold op. " + "The format of Y is [N, C_out, output_height, output_width], " + "where N is the batch size, " + "C_in is the output channels of Y, output_height and " + "output_width " + "is the calculated height and width of output feature map."); + AddAttr>( + "output_sizes", + "vector, the output sizes of the convolution operator."); + AddAttr>( + "kernel_sizes", + "vector, the kernel sizes of the convolution operator."); + AddAttr>( + "strides", "vector, the strides of the convolution operator."); + AddAttr>( + "paddings", + "vector, the paddings applied to pad the feature map."); + AddAttr>( + "dilations", "vector, the dilations of the convolution operator."); + AddComment(R"DOC( +**Fold Operator** + +This Operator is used to combines an array of sliding local blocks into a large containing +tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each +combined value in the resulting large tensor by summing all values from all containing blocks. +Unfold extracts the values in the local blocks by copying from the large tensor. 
So, if the +blocks overlap, they are not inverses of each other. + )DOC"); + } +}; + +class FoldGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Y")), true, + platform::errors::NotFound("The gradient of Y should not be null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::NotFound("The input X should not be null")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("X")), true, + platform::errors::NotFound("The gradient of X should not be null")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Y")), + ctx.device_context()); + } +}; + +template +class FoldGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fold_grad"); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + op->SetInput("X", this->Input("X")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(FoldGradOpNoNeedBufferVarsInferer, "X"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fold, ops::FoldOp, ops::FoldOpMaker, + ops::FoldGradMaker, + ops::FoldGradMaker); +REGISTER_OPERATOR(fold_grad, ops::FoldGradOp, + ops::FoldGradOpNoNeedBufferVarsInferer); + +REGISTER_OP_CPU_KERNEL( + fold, ops::FoldOpKernel, + ops::FoldOpKernel); +REGISTER_OP_CPU_KERNEL( + fold_grad, ops::FoldGradOpKernel, + ops::FoldGradOpKernel); diff --git a/paddle/fluid/operators/fold_op.cu b/paddle/fluid/operators/fold_op.cu new file mode 100644 index 0000000000000..b2aa0728c6251 --- /dev/null +++ b/paddle/fluid/operators/fold_op.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fold_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + fold, ops::FoldOpKernel, + ops::FoldOpKernel); + +REGISTER_OP_CUDA_KERNEL( + fold_grad, + ops::FoldGradOpKernel, + ops::FoldGradOpKernel); diff --git a/paddle/fluid/operators/fold_op.h b/paddle/fluid/operators/fold_op.h new file mode 100644 index 0000000000000..d37edbfe80375 --- /dev/null +++ b/paddle/fluid/operators/fold_op.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
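/* A worked example (standalone sketch) of the sliding-block arithmetic that
 * FoldOp::InferShape and the fold kernels above check: the number of blocks per
 * spatial dimension is the usual convolution output size, and X's last
 * dimension L (with X of shape [N, C_out * kernel_h * kernel_w, L]) must equal
 * blocks_height * blocks_width. */
#include <cstdio>

int SlidingBlocks(int output_size, int padding, int dilation, int kernel,
                  int stride) {
  return (output_size + 2 * padding - (dilation * (kernel - 1) + 1)) / stride +
         1;
}

int main() {
  // output_sizes = {4, 5}, kernel_sizes = {2, 2}, strides = {1, 1},
  // paddings = {0, 0, 0, 0}, dilations = {1, 1}
  const int bh = SlidingBlocks(4, 0, 1, 2, 1);  // 3
  const int bw = SlidingBlocks(5, 0, 1, 2, 1);  // 4
  // X must then be [N, C_out * 2 * 2, 3 * 4] and Y comes out as [N, C_out, 4, 5].
  std::printf("blocks: %d x %d = %d\n", bh, bw, bh * bw);
  return 0;
}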
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FoldOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("X"); + const int batch_size = static_cast(input->dims()[0]); + Tensor* output = ctx.Output("Y"); + output->mutable_data(ctx.GetPlace()); + + std::vector output_sizes = ctx.Attr>("output_sizes"); + std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + + math::Col2ImFunctor col2im; + + auto& dev_ctx = ctx.template device_context(); + + auto input_dims = input->dims(); + + int output_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + int output_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + + 1; + + int n_input_plane = input_dims[1]; + int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); + + framework::DDim output_shape( + {n_output_plane, output_sizes[0], output_sizes[1]}); + + framework::DDim input_matrix_shape({input_dims[0], kernel_sizes[0], + kernel_sizes[1], output_height, + output_width}); + math::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + for (int i = 0; i < batch_size; i++) { + Tensor out_batch = + output->Slice(i, i + 1).Resize(output_shape); // im size=3 + Tensor in_batch = + input->Slice(i, i + 1).Resize(input_matrix_shape); // col size=5 + col2im(dev_ctx, in_batch, dilations, strides, paddings, &out_batch); + } + } +}; + +template +class FoldGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* output_grad = ctx.Input(framework::GradVarName("Y")); + Tensor* input_grad = ctx.Output(framework::GradVarName("X")); + input_grad->mutable_data(ctx.GetPlace()); + + if ((!output_grad) || (!input_grad)) return; + + std::vector output_sizes = ctx.Attr>("output_sizes"); + std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + + const int batch_size = static_cast(input_grad->dims()[0]); + + auto input_dims = input_grad->dims(); + + int output_height = (output_sizes[0] + 2 * paddings[0] - + (dilations[0] * (kernel_sizes[0] - 1) + 1)) / + strides[0] + + 1; + int output_width = (output_sizes[1] + 2 * paddings[1] - + (dilations[1] * (kernel_sizes[1] - 1) + 1)) / + strides[1] + + 1; + + int n_input_plane = input_dims[1]; + int n_output_plane = n_input_plane / (kernel_sizes[0] * kernel_sizes[1]); + + framework::DDim output_shape( + {n_output_plane, output_sizes[0], output_sizes[1]}); + framework::DDim 
input_matrix_shape({input_dims[0], kernel_sizes[0], + kernel_sizes[1], output_height, + output_width}); + + math::Im2ColFunctor im2col; + auto& dev_ctx = ctx.template device_context(); + + for (int i = 0; i < batch_size; i++) { + Tensor out_grad_batch = output_grad->Slice(i, i + 1).Resize(output_shape); + Tensor in_grad_batch = + input_grad->Slice(i, i + 1).Resize(input_matrix_shape); + im2col(dev_ctx, out_grad_batch, dilations, strides, paddings, + &in_grad_batch); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 7666ea7aee23c..f2f6b6bfe01d1 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -191,9 +191,10 @@ void SetConfigForColumnReduce(const int max_threads, const int reduce_num, int num_block = (max_threads / left_num); if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { - *blocking_size = details::GetLastPow2(reduce_num / num_block); + *blocking_size = + pten::kernels::details::GetLastPow2(reduce_num / num_block); if (*blocking_size <= 1) { - *blocking_size = details::GetLastPow2(sqrt(reduce_num)); + *blocking_size = pten::kernels::details::GetLastPow2(sqrt(reduce_num)); } else if (*blocking_size * 2 < reduce_num) { *blocking_size *= 2; } diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 33fde64164d12..3972c60e8347b 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -51,6 +51,18 @@ struct DropoutParam { seed_val = 0; } + DropoutParam(bool fix_seed_, uint64_t seed_, bool is_test_, + bool is_upscale_in_train_, float dropout_prob_, + const framework::Tensor* tensor_seed_, int seed_val_) { + fix_seed = fix_seed_; + seed = seed_; + is_test = is_test_; + is_upscale_in_train = is_upscale_in_train_; + dropout_prob = dropout_prob_; + tensor_seed = tensor_seed_; + seed_val = seed_val_; + } + /** * dropout_index: can be 0, 1, 2. 
0 means there is only one dropout, * 1 and 2 represent two dropout, the parameter name of dropout @@ -238,11 +250,14 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } // out = layernorm(residual + dropout(src + bias)) - void LayernormResidualDropoutBias( - const platform::CUDADeviceContext& ctx, const T* src, const T* residual, - const T* bias, const LayerNormParamType* gamma, - const LayerNormParamType* beta, T* dropout_out, MaskType* mask, T* out, - LayerNormParamType* mean, LayerNormParamType* variance) { + template , bool is_same_type = false> + void LayernormResidualDropoutBias(const platform::CUDADeviceContext& ctx, + const T* src, const T* residual, + const T* bias, const P* gamma, + const P* beta, T* dropout_out, + MaskType* mask, T* out, + LayerNormParamType* mean, + LayerNormParamType* variance) { using U = LayerNormParamType; int vec_size = MAX_CACHE_BYTES / sizeof(T); if (this->cols_ % vec_size != 0) { @@ -251,7 +266,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { int threads = GetDesiredBlockDim(this->cols_ / vec_size); int increment = ((this->cols_ - 1) / (threads * vec_size) + 1) * vec_size; increment = this->dropout_param_.UpdateSeedAndIncrement(ctx, increment); - LaunchLayernormResidualDropoutBias( + LaunchLayernormResidualDropoutBias( this->rows_, this->cols_, increment, this->dropout_param_.seed, this->dropout_param_.dropout_prob, epsilon_, this->dropout_param_.is_upscale_in_train, this->dropout_param_.is_test, @@ -259,17 +274,19 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { variance, ctx); } - void LayernormResidualDropoutBiasGrad( - const platform::CUDADeviceContext& ctx, const T* d_out, - const T* layernorm_src, const MaskType* mask, - const LayerNormParamType* gamma, const LayerNormParamType* mean, - const LayerNormParamType* variance, T* d_layernorm_src, - LayerNormParamType* d_scale, LayerNormParamType* d_layernorm_bias, - T* d_dropout_src, T* d_bias, T* d_residual) { + template , bool is_same_type = false> + void LayernormResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + const T* d_out, const T* layernorm_src, + const MaskType* mask, const P* gamma, + const LayerNormParamType* mean, + const LayerNormParamType* variance, + T* d_layernorm_src, P* d_scale, + P* d_layernorm_bias, T* d_dropout_src, + T* d_bias, T* d_residual) { using U = LayerNormParamType; - LayerNormBackward(layernorm_src, d_out, gamma, mean, variance, - d_layernorm_src, d_scale, d_layernorm_bias, - epsilon_, this->rows_, this->cols_, ctx); + LayerNormBackward( + layernorm_src, d_out, gamma, mean, variance, d_layernorm_src, d_scale, + d_layernorm_bias, epsilon_, this->rows_, this->cols_, ctx); this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src, d_residual, d_bias); } diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 1827e137c15f1..b27b70dc9dc0c 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -24,46 +24,57 @@ using CudnnDataType = platform::CudnnDataType; template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; +template +using LayerNormScaleBiasT = + typename std::conditional::type; + /** * @brief fused add_bias, dropout, add residual and leyer_norm into one * operators. 
Currently only support forward */ -template -__device__ void CalcLayernormY(const LayerNormParamType *scale, - const LayerNormParamType *bias, const T *x, - T *y, const int row_id, const int col_id, - const int cols, - const LayerNormParamType mean_val, - const LayerNormParamType invvar) { - using U = LayerNormParamType; +template +__device__ void CalcLayernormY( + const LayerNormScaleBiasT *scale, + const LayerNormScaleBiasT *bias, const T *x, + T *y, const int row_id, const int col_id, const int cols, + const LayerNormParamType mean_val, const LayerNormParamType invvar) { using LoadT = platform::AlignedVector; using StoreT = platform::AlignedVector; using LoadU = platform::AlignedVector; + using LoadScaleOrBias = + platform::AlignedVector, + VecSize>; for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { - LoadU scale_vec; - LoadU bias_vec; + LoadScaleOrBias scale_vec; + LoadScaleOrBias bias_vec; LoadT x_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { - scale_vec[ii] = static_cast(1); - bias_vec[ii] = static_cast(0); + scale_vec[ii] = + static_cast>(1); + bias_vec[ii] = + static_cast>(0); } // vectorize load data from global platform::Load(&x[row_id * cols + i], &x_vec); if (scale != nullptr) { - platform::Load(&scale[i], &scale_vec); + platform::Load, + VecSize>(&scale[i], &scale_vec); } if (bias != nullptr) { - platform::Load(&bias[i], &bias_vec); + platform::Load, + VecSize>(&bias[i], &bias_vec); } StoreT y_vec; for (int ii = 0; ii < VecSize; ii++) { - y_vec[ii] = static_cast( - scale_vec[ii] * (static_cast(x_vec[ii]) - mean_val) * invvar + - bias_vec[ii]); + y_vec[ii] = + static_cast(static_cast(scale_vec[ii]) * + (static_cast(x_vec[ii]) - mean_val) * invvar + + static_cast(bias_vec[ii])); } platform::Store(y_vec, &y[row_id * cols + i]); } @@ -85,15 +96,17 @@ __device__ void CalcLayernormY(const LayerNormParamType *scale, * means: [rows]: layernorm means * vars: [rows]: layernorm vars */ -template +template __global__ void FusedLayernormResidualDropoutBias( const size_t rows, const size_t cols, uint64_t seed, const float dropout_prob, const bool is_upscale_in_train, const bool is_test, const uint64_t increment, const float epsilon, const T *src, const T *residual, const T *bias, - const LayerNormParamType *scale, - const LayerNormParamType *layernorm_bias, MaskType *mask, T *dst, - T *layernorm_dst, LayerNormParamType *mean, LayerNormParamType *var) { + const LayerNormScaleBiasT *scale, + const LayerNormScaleBiasT *layernorm_bias, + MaskType *mask, T *dst, T *layernorm_dst, LayerNormParamType *mean, + LayerNormParamType *var) { int col_id = threadIdx.x; int row_id = blockIdx.x; int idx = row_id * cols + col_id; @@ -101,7 +114,6 @@ __global__ void FusedLayernormResidualDropoutBias( curand_init(seed, idx, increment, &state); T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); - using U = LayerNormParamType; __shared__ U mean_share; __shared__ U var_share; @@ -121,10 +133,12 @@ __global__ void FusedLayernormResidualDropoutBias( mean_val = BlockReduceSum(mean_val, shared_mean); var_val = BlockReduceSum(var_val, shared_var); if (threadIdx.x == 0) { - auto scale = static_cast(1.) / static_cast(cols); - auto tmp = mean_val * scale; + auto scale = static_cast>( + static_cast(1.) 
/ static_cast(cols)); + auto tmp = mean_val * static_cast(scale); mean[row_id] = mean_share = static_cast(tmp); - var_share = static_cast(var_val * scale - mean_share * mean_share); + var_share = static_cast(var_val * static_cast(scale) - + mean_share * mean_share); var_share = var_share > U(0) ? var_share : U(0); var[row_id] = var_share; } @@ -134,8 +148,9 @@ __global__ void FusedLayernormResidualDropoutBias( U invvar = rsqrt_(var_share + static_cast(epsilon)); // calculate layernorm_dst - CalcLayernormY(scale, layernorm_bias, dst, layernorm_dst, row_id, - col_id, cols, mean_val, invvar); + CalcLayernormY( + scale, layernorm_bias, dst, layernorm_dst, row_id, col_id, cols, mean_val, + invvar); } /** @@ -154,16 +169,17 @@ __global__ void FusedLayernormResidualDropoutBias( * means: [rows]: layernorm means * vars: [rows]: layernorm vars */ -template +template void LaunchLayernormResidualDropoutBias( const uint32_t rows, const uint32_t cols, const int increment, uint64_t seed, const float dropout_prob, const float epsilon, const bool is_upscale_in_train, const bool is_test, const T *src, - const T *residual, const T *bias, const LayerNormParamType *scale, - const LayerNormParamType *layernorm_bias, MaskType *mask_data, T *dst, - T *layernorm_dst, LayerNormParamType *mean, LayerNormParamType *var, - const platform::CUDADeviceContext &ctx) { - using U = LayerNormParamType; + const T *residual, const T *bias, + const LayerNormScaleBiasT *scale, + const LayerNormScaleBiasT *layernorm_bias, + MaskType *mask_data, T *dst, T *layernorm_dst, LayerNormParamType *mean, + LayerNormParamType *var, const platform::CUDADeviceContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); @@ -175,8 +191,9 @@ void LaunchLayernormResidualDropoutBias( // call layernorm forward switch (GetDesiredBlockDim(cols)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( + LayerNormForward< + T, U, kBlockDim, + ScaleBiasWithSameTypeX><<>>( dst, scale, layernorm_bias, layernorm_dst, mean, var, epsilon, cols)); default: @@ -184,21 +201,24 @@ void LaunchLayernormResidualDropoutBias( "Product from begin_norm_axis to end must be larger than 1")); break; } + return; } const int VecSize = MAX_CACHE_BYTES / sizeof(T); if (cols % VecSize != 0) { int blockDim = GetDesiredBlockDim(cols); - FusedLayernormResidualDropoutBias<<>>( + FusedLayernormResidualDropoutBias< + T, uint8_t, 1, U, + ScaleBiasWithSameTypeX><<>>( rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, layernorm_dst, mean, var); } else { int blockDim = GetDesiredBlockDim(cols / VecSize); FusedLayernormResidualDropoutBias< - T, uint8_t, VecSize><<>>( + T, uint8_t, VecSize, U, + ScaleBiasWithSameTypeX><<>>( rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, increment, epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, layernorm_dst, mean, var); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 50e3555b4bcd6..57d3fc94dc88a 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -223,7 +223,7 @@ struct TestFusedLayernormResidualDropoutBias { layernorm_bias_ptr = layernorm_bias.data(); } - 
paddle::operators::LaunchLayernormResidualDropoutBias( + paddle::operators::LaunchLayernormResidualDropoutBias( rows, cols, increment, seed, dropout_prob, epsilon, is_upscale_in_train, is_test, src.data(), residual.data(), bias_ptr, scale_ptr, layernorm_bias_ptr, mask.data(), out.data(), diff --git a/paddle/fluid/operators/gather_scatter_kernel.cc b/paddle/fluid/operators/gather_scatter_kernel.cc new file mode 100644 index 0000000000000..285e857e976e0 --- /dev/null +++ b/paddle/fluid/operators/gather_scatter_kernel.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gather_scatter_kernel.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class TensorAssign { + public: + template + void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data; + } +}; +static TensorAssign tensor_assign; + +class ReduceAdd { + public: + template + void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data += *src_data; + } +}; +static ReduceAdd reduce_add; + +class ReduceMultiply { + public: + template + void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data *= *src_data; + } +}; +static ReduceMultiply reduce_mul; + +template +struct cpu_gather_scatter_functor { + template + void operator()(Tensor self, int dim, const Tensor& index, const Tensor& src, + const std::string& method_name, const func_t& reduce_op, + const platform::DeviceContext& ctx) { + if (index.numel() == 0) { + return; + } + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* src_data = src.data(); + int64_t self_size = self.numel(); + int64_t index_size = index.numel(); + int64_t src_size = src.numel(); + auto self_dims = self.dims(); + auto index_dims = index.dims(); + auto src_dims = src.dims(); + if (self_size == 0 || src_size == 0 || index_size == 0) { + VLOG(3) << "zero size input found"; + platform::errors::InvalidArgument( + "self_size, src_size, index_size cannot be 0"); + return; + } + int select_dim_size = index_dims[dim]; + // index matrix has different shape with self matrix or src matrix. + int replaced_select_dim_size = + is_scatter_like ? 
self_dims[dim] : src_dims[dim]; + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + for (int64_t i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + } + int64_t index_idx = 0; + int64_t self_idx, src_idx; + + // N layer loop squeezed into 3 layers loop + for (int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + + /* + gather computation formula: + + self[i][j][k] = src[index[i][j][k]][j][k] # if dim == 0 + self[i][j][k] = src[i][index[i][j][k]][k] # if dim == 1 + self[i][j][k] = src[i][j][index[i][j][k]] # if dim == 2 + + scatter computation formula: + + self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + + */ + + // This index might out of bound of index matrix's index, so here + // multiply the replaced_select_dim_size. + int64_t replace_index = k + index * outer_dim_size + + i * outer_dim_size * replaced_select_dim_size; + + self_idx = is_scatter_like ? replace_index : index_idx; + src_idx = is_scatter_like ? index_idx : replace_index; + + reduce_op((tensor_t*)(self_data + self_idx), + (tensor_t*)(src_data + src_idx)); + index_idx++; + } + } + } + } +}; + +template +void cpu_gather_kernel(Tensor self, int dim, const Tensor& index, Tensor result, + const platform::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + result, dim, index, self, "gather_out_cpu", tensor_assign, ctx); +} + +template +void cpu_scatter_assign_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_assign_cpu", tensor_assign, ctx); +} + +template +void cpu_scatter_add_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_add_cpu", reduce_add, ctx); +} + +template +void cpu_scatter_mul_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx) { + cpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mul_cpu", reduce_mul, ctx); +} + +template +void cpu_scatter_input_grad_kernel(Tensor self, int dim, const Tensor& index, + Tensor output, + const platform::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* output_data = output.data(); + + auto index_dims = index.dims(); + auto output_dims = output.dims(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int select_dim_size = index_dims[dim]; + int output_select_dim_size = output_dims[dim]; + for (int64_t i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + } + + int64_t index_idx = 0; + for (int64_t i = 0; i < inner_dim_size; i++) { + for (int64_t j = 0; j < select_dim_size; j++) { + for (int64_t k = 0; k < outer_dim_size; k++) { + int64_t index = index_data[index_idx]; + int64_t replace_index = k + index * outer_dim_size + + i * outer_dim_size * output_select_dim_size; + output_data[replace_index] = 0; + index_idx++; + } + } + } +} + +Instantiate_Template_Function(cpu_gather_kernel) + Instantiate_Template_Function(cpu_scatter_assign_kernel) + Instantiate_Template_Function(cpu_scatter_add_kernel) + 
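/* A standalone sketch of the flattened index arithmetic used by the CPU (and
 * GPU) gather/scatter loops above. The index tensor is walked as three squeezed
 * loops [inner, select, outer] around `dim`; element (i, j, k) holding index
 * value `idx` maps into the self (scatter) or src (gather) tensor at
 *   replace_index = k + idx * outer + i * outer * replaced_select
 * where replaced_select is that tensor's size along `dim`. Function and
 * variable names below are illustrative only. */
#include <cstdint>
#include <cstdio>

int64_t ReplaceIndex(int64_t i, int64_t k, int64_t idx, int64_t outer,
                     int64_t replaced_select) {
  return k + idx * outer + i * outer * replaced_select;
}

int main() {
  // dim = 1, index shape [2, 3, 4] (inner = 2, select = 3, outer = 4),
  // self's size along dim is 5 (scatter-like). Element (i=1, j=2, k=3) with
  // idx = 4 lands at 3 + 4*4 + 1*4*5 = 39, i.e. position (1, 4, 3) of a
  // row-major [2, 5, 4] tensor.
  std::printf("%lld\n", static_cast<long long>(ReplaceIndex(1, 3, 4, 4, 5)));
  return 0;
}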
Instantiate_Template_Function(cpu_scatter_mul_kernel) + Instantiate_Template_Function(cpu_scatter_input_grad_kernel) + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu new file mode 100644 index 0000000000000..dc87fc52aacb4 --- /dev/null +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -0,0 +1,238 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class TensorAssign { + public: + template + constexpr void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data = *src_data; + } +}; +static TensorAssign tensor_assign; + +class ReduceAdd { + public: + template < + typename tensor_t, + std::enable_if_t::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + platform::CudaAtomicAdd(self_data, *src_data); + } + template ::value>* = nullptr> + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data += *src_data; + } +}; +static ReduceAdd reduce_add; + +class ReduceMul { + public: + template + __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { + *self_data *= *src_data; + // TODO(huangxu96) platform::CudaAtomicMul(*self_data, *src_data); + } +}; +static ReduceMul reduce_mul; + +template +__global__ void GatherScatterGPUKernel( + tensor_t* self_data, int dim, const index_t* index_data, tensor_t* src_data, + int64_t inner_dim_size, int select_dim_size, int replaced_select_dim_size, + int64_t outer_dim_size, int64_t numel, const func_t& reduce_op) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; // The i, j, k here is the index of the 3 layers loop + // squeezed from the N layers loop. + /* tid = i * select_dim_size * outer_dim_size + j * outer_dim_size + k */ + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + /* + gather computation formula: + + self[i][j][k] = src[index[i][j][k]][j][k] # if dim == 0 + self[i][j][k] = src[i][index[i][j][k]][k] # if dim == 1 + self[i][j][k] = src[i][j][index[i][j][k]] # if dim == 2 + + scatter computation formula: + + self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + + */ + // index matrix has different shape with self matrix or src matrix. + int64_t replace_index = k + index * outer_dim_size + + i * outer_dim_size * replaced_select_dim_size; + int64_t self_idx = is_scatter_like ? replace_index : tid; + int64_t src_idx = is_scatter_like ? 
tid : replace_index; + reduce_op((tensor_t*)(self_data + self_idx), (tensor_t*)(src_data + src_idx)); +} + +template +struct gpu_gather_scatter_functor { + template + void operator()(Tensor self, int dim, const Tensor& index, Tensor src, + const std::string& method_name, const func_t& reduce_op, + const platform::DeviceContext& ctx) { + if (index.numel() == 0) { + return; + } + auto* self_data = self.data(); + auto* index_data = index.data(); + auto* src_data = src.data(); + int64_t self_size = self.numel(); + int64_t index_size = index.numel(); + int64_t src_size = src.numel(); + auto self_dims = self.dims(); + auto index_dims = index.dims(); + auto src_dims = src.dims(); + if (self_size == 0 || src_size == 0 || index_size == 0) return; + int select_dim_size = index_dims[dim]; + // index matrix has different shape with self matrix or src matrix. + int replaced_select_dim_size = + is_scatter_like ? self_dims[dim] : src_dims[dim]; + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + for (int64_t i = 0; i < index_dims.size(); ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + } + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + auto stream = + reinterpret_cast(ctx).stream(); + GatherScatterGPUKernel<<>>( + self_data, dim, index_data, src_data, inner_dim_size, select_dim_size, + replaced_select_dim_size, outer_dim_size, index_size, reduce_op); + } +}; // struct gpu_gather_scatter_functor + +template +void gpu_gather_kernel(Tensor self, int dim, const Tensor& index, Tensor result, + const platform::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + result, dim, index, self, "gather_out_gpu", tensor_assign, ctx); + return; +} + +template +void gpu_scatter_assign_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_assign_gpu", tensor_assign, ctx); +} + +template +void gpu_scatter_add_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_add_gpu", reduce_add, ctx); +} + +template +void gpu_scatter_mul_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx) { + gpu_gather_scatter_functor()( + self, dim, index, src, "scatter_mul_gpu", reduce_mul, ctx); +} + +template +__global__ void ScatterInputGradGPUKernel( + tensor_t* grad_data, int dim, const index_t* index_data, + int64_t inner_dim_size, int select_dim_size, int grad_select_dim_size, + int64_t outer_dim_size, int64_t numel) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= numel) return; + int64_t i, j, k; + i = tid / (select_dim_size * outer_dim_size); + int64_t remind = tid % (select_dim_size * outer_dim_size); + j = remind / outer_dim_size; + k = remind % outer_dim_size; + index_t index = index_data[tid]; + int64_t replace_index = + k + index * outer_dim_size + i * outer_dim_size * grad_select_dim_size; + grad_data[replace_index] = 0; +} +template +void gpu_scatter_input_grad_kernel(Tensor self, int dim, const Tensor& index, + Tensor grad, + const platform::DeviceContext& ctx) { + auto* index_data = index.data(); + auto* grad_data = grad.data(); + + auto index_dims = index.dims(); + auto grad_dims = 
grad.dims(); + int64_t index_size = index.numel(); + + int64_t inner_dim_size = 1; + int64_t outer_dim_size = 1; + int select_dim_size = index_dims[dim]; + int grad_select_dim_size = grad_dims[dim]; + for (int64_t i = 0; i < dim; ++i) { + inner_dim_size *= index_dims[i]; + } + + for (int i = dim + 1; i < index_dims.size(); i++) { + outer_dim_size *= index_dims[i]; + } + + int64_t slice_size = 1; + for (int i = 1; i < grad_dims.size(); ++i) slice_size *= grad_dims[i]; + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + auto stream = + reinterpret_cast(ctx).stream(); + + ScatterInputGradGPUKernel<<>>( + grad_data, dim, index_data, inner_dim_size, select_dim_size, + grad_select_dim_size, outer_dim_size, index_size); +} +Instantiate_Template_Function(gpu_gather_kernel) + Instantiate_Template_Function(gpu_scatter_assign_kernel) + Instantiate_Template_Function(gpu_scatter_add_kernel) + Instantiate_Template_Function(gpu_scatter_mul_kernel) + Instantiate_Template_Function(gpu_scatter_input_grad_kernel) + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_scatter_kernel.h b/paddle/fluid/operators/gather_scatter_kernel.h new file mode 100644 index 0000000000000..1cbc18969186b --- /dev/null +++ b/paddle/fluid/operators/gather_scatter_kernel.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/tensor.h" + +#pragma once + +namespace paddle { +namespace operators { + +#define Instantiate_Template_Function(func) \ + Instantiate_Template_Function_index_t( \ + func, int) Instantiate_Template_Function_index_t(func, float) \ + Instantiate_Template_Function_index_t(func, double) \ + Instantiate_Template_Function_index_t(func, int64_t) \ + Instantiate_Template_Function_index_t(func, platform::float16) \ + Instantiate_Template_Function_index_t(func, unsigned char) + +#define Instantiate_Template_Function_index_t(func, tensor_t) \ + template void func(Tensor input, int dim, \ + const Tensor& index, Tensor result, \ + const platform::DeviceContext& ctx); \ + template void func(Tensor input, int dim, \ + const Tensor& index, Tensor result, \ + const platform::DeviceContext& ctx); + +using Tensor = framework::Tensor; + +template +void cpu_gather_kernel(Tensor self, int dim, const Tensor& index, Tensor result, + const platform::DeviceContext& ctx); + +template +void cpu_scatter_assign_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + +template +void cpu_scatter_add_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + +template +void cpu_scatter_mul_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + +template +void cpu_scatter_input_grad_kernel(Tensor self, int dim, const Tensor& index, + Tensor result, + const platform::DeviceContext& ctx); + +template +void gpu_gather_kernel(Tensor self, int dim, const Tensor& index, Tensor result, + const platform::DeviceContext& ctx); + +template +void gpu_scatter_assign_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + +template +void gpu_scatter_add_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + +template +void gpu_scatter_mul_kernel(Tensor self, int dim, const Tensor& index, + Tensor src, const platform::DeviceContext& ctx); + +template +void gpu_scatter_input_grad_kernel(Tensor self, int dim, const Tensor& index, + Tensor result, + const platform::DeviceContext& ctx); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 2ea432db6c7f0..ef0e000b25efd 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fill_constant_op.h" namespace paddle { @@ -38,10 +39,12 @@ struct GaussianGenerator { __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed_); - thrust::normal_distribution dist(mean_, std_); + using MT = typename details::MPTypeTrait::Type; + thrust::normal_distribution dist(mean_, std_); unsigned int new_n = n + offset_; rng.discard(new_n); - return dist(rng); + MT out = dist(rng); + return static_cast(out); } }; @@ -124,10 +127,14 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(gaussian_random, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel); +REGISTER_OP_CUDA_KERNEL( + gaussian_random, + paddle::operators::GPUGaussianRandomKernel, + paddle::operators::GPUGaussianRandomKernel, + paddle::operators::GPUGaussianRandomKernel); REGISTER_OP_CUDA_KERNEL( gaussian_random_batch_size_like, + paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< + paddle::platform::float16>, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel); diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 6a4a322b327a0..8151d21fa676d 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/gelu_op.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -27,9 +26,11 @@ struct GeluWithApproximateFunctor { // this function is tanh approximation of gelu MPType x = static_cast(arg_x); MPType one = static_cast(1); - MPType out = x * static_cast(0.5) * - (one + tanh(static_cast(0.79788456) * x * - (one + static_cast(0.044715) * x * x))); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + auto tanh_out = + tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); + MPType out = x * half * (one + tanh_out); return static_cast(out); } }; @@ -40,10 +41,7 @@ struct GeluWithoutApproximateFunctor { inline HOSTDEVICE T operator()(T arg_x) { // actual gelu with approximation = false MPType x = static_cast(arg_x); - MPType erf_out = erf(x * static_cast(M_SQRT1_2)); - MPType out = - x * static_cast(0.5) * (static_cast(1) + erf_out); - return static_cast(out); + return static_cast(x * normcdf(x)); } }; @@ -71,6 +69,66 @@ class GeluKernel } }; +template +struct GeluWithApproximateGradFunctor { + using MPType = typename details::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + MPType kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + auto cube_x = x * x * x; + auto tanh_out = + tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + x)); + auto ans = + half * (one + tanh_out + + (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); + return static_cast(ans * dout); + } 
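// Derivation sketch for the gradient returned above, under the same tanh
// approximation used in the forward functor:
//   gelu(x) ~= 0.5 * x * (1 + tanh(u)),  u = kAlpha * (x + GELU_CONSTANT * x^3)
//   d/dx    =  0.5 * (1 + tanh(u)) + 0.5 * x * (1 - tanh(u)^2) * du/dx
// where du/dx = kAlpha * (1 + 3 * GELU_CONSTANT * x^2), so
//   x * du/dx = kAlpha * x + kBeta * x^3
// which is the (x * kAlpha + kBeta * cube_x) factor in `ans`; the result is
// then scaled by dout for the chain rule.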
+}; + +template +struct GeluWithoutApproximateGradFunctor { + using MPType = typename details::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); + const MPType cdf = normcdf(x); + const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; + return static_cast(dout * (cdf + x * pdf)); + } +}; + +template +class GeluGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* dout = + context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + auto approximate = context.Attr("approximate"); + dx->mutable_data(dout->place()); + + std::vector ins = {x, dout}; + std::vector outs = {dx}; + const auto& dev_ctx = + context.template device_context(); + if (approximate) { + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); + } else { + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h index 0446d7d284b22..a913b8a111279 100644 --- a/paddle/fluid/operators/gelu_op.h +++ b/paddle/fluid/operators/gelu_op.h @@ -30,6 +30,8 @@ limitations under the License. */ namespace paddle { namespace operators { +#define GELU_CONSTANT 0.044715 + template struct GeluFunctor { template @@ -41,14 +43,14 @@ struct GeluFunctor { auto casted_x = x.template cast(); auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (casted_x + static_cast(0.044715) * casted_x.cube())) + (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) .tanh(); out.device(d) = (casted_x * static_cast(0.5) * (static_cast(1) + temp)) .template cast(); } else { auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (x + static_cast(0.044715) * x.cube())) + (x + static_cast(GELU_CONSTANT) * x.cube())) .tanh(); out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); } @@ -101,10 +103,10 @@ struct GeluGradFunctor { const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); const float kBeta = - kAlpha * static_cast(0.044715) * static_cast(3); + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); const auto y = (kAlpha * - ((static_cast(0.044715) * casted_x.cube()) + casted_x)) + ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) .tanh(); dx.device(d) = (static_cast(0.5) * casted_dout * (static_cast(1) + y + @@ -113,9 +115,10 @@ struct GeluGradFunctor { .template cast(); } else { const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - const T kBeta = kAlpha * static_cast(0.044715) * static_cast(3); + const T kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); const auto y = - (kAlpha * ((static_cast(0.044715) * x.cube()) + x)).tanh(); + (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); dx.device(d) = static_cast(0.5) * dout * (static_cast(1) + y + (x - x * y.square()) * (kAlpha + kBeta * x.square())); diff --git a/paddle/fluid/operators/huber_loss_op_xpu.cc b/paddle/fluid/operators/huber_loss_op_xpu.cc new file mode 100644 index 0000000000000..767ce542736e8 --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op_xpu.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/huber_loss_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class HuberLossXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in0 = ctx.Input("X"); + auto* in1 = ctx.Input("Y"); + auto* residual = ctx.Output("Residual"); + auto* out = ctx.Output("Out"); + auto delta = ctx.Attr("delta"); + + auto residual_data = residual->mutable_data(ctx.GetPlace()); + auto out_data = out->mutable_data(ctx.GetPlace()); + auto in0_data = in0->data(); + auto in1_data = in1->data(); + + auto& dev_ctx = + ctx.template device_context(); + int r = xpu::huber_loss(dev_ctx.x_context(), in0_data, in1_data, + residual_data, out_data, in0->numel(), 1, delta); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(huber_loss) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +template +class HuberLossGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* residual = ctx.Input("Residual"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto delta = ctx.Attr("delta"); + + T* dx_data = nullptr; + T* dy_data = nullptr; + if (dx) { + dx_data = dx->mutable_data(ctx.GetPlace()); + } + if (dy) { + dy_data = dy->mutable_data(ctx.GetPlace()); + } + auto dout_data = dout->data(); + auto residual_data = residual->data(); + auto& dev_ctx = + ctx.template device_context(); + int r = + xpu::huber_loss_grad(dev_ctx.x_context(), residual_data, dout_data, + dx_data, dy_data, dout->numel(), 1, delta); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(huber_loss_grad) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(huber_loss, ops::HuberLossXPUKernel); +REGISTER_OP_XPU_KERNEL(huber_loss_grad, ops::HuberLossGradXPUKernel); + +#endif diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h index 19355434955a2..ce45ed0301e92 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h @@ -254,8 +254,8 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, } } } else { // blockDim,x * NX < num - const int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1; - const int kVectorsPerThread = NX / kVectorSize; + constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 
2 : 1; + constexpr int kVectorsPerThread = NX / kVectorSize; int thread_offset = threadIdx.x * kVectorsPerThread; using VecType = details::VectorType; @@ -441,8 +441,8 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, } } else { // Vector type - const int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1; - const int kVectorsPerThread = NX / kVectorSize; + constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1; + constexpr int kVectorsPerThread = NX / kVectorSize; int thread_offset = threadIdx.x * kVectorsPerThread; using VecType = details::VectorType; diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc new file mode 100644 index 0000000000000..83071e09e3252 --- /dev/null +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -0,0 +1,174 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/kthvalue_op.h" +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +class KthvalueOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue"); + OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue"); + auto input_dims = ctx->GetInputDim("X"); + const int& dim_size = input_dims.size(); + int axis = static_cast(ctx->Attrs().Get("axis")); + PADDLE_ENFORCE_LT(axis, dim_size, + paddle::platform::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, dim_size, axis)); + PADDLE_ENFORCE_GE(axis, -dim_size, + paddle::platform::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, dim_size, axis)); + if (axis < 0) axis += dim_size; + int k = static_cast(ctx->Attrs().Get("k")); + PADDLE_ENFORCE_GE( + k, 1, paddle::platform::errors::InvalidArgument( + "the k in the kthvalue must >= 1, but received %d .", k)); + PADDLE_ENFORCE_GE(input_dims.size(), 1, + paddle::platform::errors::InvalidArgument( + "input of kthvalue must have >= 1d shape")); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GE( + input_dims[axis], k, + paddle::platform::errors::InvalidArgument( + "input of kthvalue must have >= %d columns in axis of %d", k, + axis)); + } + bool keepdim = ctx->Attrs().Get("keepdim"); + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + framework::DDim dims = framework::make_ddim(dimvec); + ctx->SetOutputDim("Out", dims); + ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + 
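// Shape example for the rule above (assumed values, for illustration only):
// X with shape [2, 3, 4], axis == 1 and keepdim == false gives Out/Indices of
// shape [2, 4]; with keepdim == true the reduced axis is kept as size 1,
// i.e. [2, 1, 4].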
ctx->ShareLoD("X", "Indices"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } +}; + +class KthvalueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddComment(R"DOC( + This operator find the k-th smallest elements in the specific axis of a Tensor. + It will return the values and corresponding indices. + )DOC"); + AddInput("X", "(Tensor) The input of Kthvalue op"); + AddOutput("Out", "(Tensor) The values of k-th smallest elements of input"); + AddOutput("Indices", + "(Tensor) The indices of k-th smallest elements of input"); + AddAttr( + "k", + "(int, default 1) k for k-th smallest elements to look for along " + "the tensor).") + .SetDefault(1); + AddAttr("axis", + "the axis to sort and get the k indices, value." + "if not set, will get k-th value in last axis.") + .SetDefault(-1); + AddAttr("keepdim", "Keep the dim that to reduce.").SetDefault(false); + } +}; + +class KthvalueOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::InvalidArgument("Input(X) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Indices"), true, + platform::errors::InvalidArgument("Input(Indices) should be not null")); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + platform::errors::InvalidArgument( + "Grad Input(Out) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("X")), true, + platform::errors::InvalidArgument("Grad Output(X) should be not null")); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class KthvalueGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("kthvalue_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("X", this->Input("X")); + op->SetInput("Indices", this->Output("Indices")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker, + ops::KthvalueGradOpMaker, + ops::KthvalueGradOpMaker); +REGISTER_OP_CPU_KERNEL( + kthvalue, ops::KthvalueCPUKernel, + ops::KthvalueCPUKernel, + ops::KthvalueCPUKernel, + ops::KthvalueCPUKernel); + +REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad); +REGISTER_OP_CPU_KERNEL( + kthvalue_grad, + ops::KthvalueGradCPUKernel, + ops::KthvalueGradCPUKernel, + ops::KthvalueGradCPUKernel, + ops::KthvalueGradCPUKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu new file mode 100644 index 0000000000000..c6c62a763aa06 --- /dev/null +++ b/paddle/fluid/operators/kthvalue_op.cu @@ 
-0,0 +1,279 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/kthvalue_op.h" +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/fluid/operators/top_k_v2_op.h" +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +#endif + +namespace paddle { +namespace operators { + +int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +bool SortKthvalue(const platform::CUDADeviceContext& ctx, + const framework::Tensor* input_tensor, const int64_t num_cols, + const int64_t num_rows, const int k, + framework::Tensor* out_tensor, + framework::Tensor* indices_tensor) { + auto cu_stream = ctx.stream(); + framework::Tensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = framework::make_ddim(dims); + input_indices.Resize(dim); + input_indices.mutable_data(ctx.GetPlace()); + size_t temp_storage_bytes = -1; + int block_size = getBlockSize(num_cols); + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + unsigned int grid_size = num_rows < maxGridDimX + ? 
static_cast(num_rows) + : maxGridDimX; + InitIndex<<>>( + input_indices.data(), num_rows, num_cols); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + framework::Tensor temp_values, temp_indices; + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); + sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); + auto err = cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, temp_storage_bytes, input, sorted_values_ptr, + input_indices.data(), sorted_indices_ptr, num_cols * num_rows, + num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, + cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " + << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, status: " + << cudaGetErrorString(err); + return false; + } +#endif + framework::Tensor temp_storage; + temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); + + err = cub::DeviceSegmentedRadixSort::SortPairs( + temp_storage.data(), temp_storage_bytes, input, + sorted_values_ptr, input_indices.data(), sorted_indices_ptr, + num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, + 0, sizeof(T) * 8, cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << cudaGetErrorString(err); + return false; + } +#endif + auto& dev = *ctx.eigen_device(); + const Eigen::DSizes slice_indices{0, k - 1}; + const Eigen::DSizes slice_sizes{num_rows, 1}; + auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = framework::EigenMatrix::From( + static_cast(temp_indices)); + std::vector odims = {static_cast(num_rows), static_cast(1)}; + dim = framework::make_ddim(odims); + auto e_values = framework::EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = framework::EigenMatrix::From( + static_cast(temp_values)); + + EigenSlice, int64_t, 2>::Eval( + dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); + EigenSlice, T, 2>::Eval( + dev, e_values, e_tmp_values, slice_indices, slice_sizes); + return true; +} + +template +class KthvalueOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument( + "It must use CUDAPlace, you must check your device set.")); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + int k = static_cast(ctx.Attr("k")); + int axis = static_cast(ctx.Attr("axis")); + bool keepdim = static_cast(ctx.Attr("keepdim")); + const auto& in_dims = input->dims(); + if (axis < 0) axis += 
in_dims.size(); + auto out_dims = output->dims(); + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + const auto& dev_ctx = ctx.cuda_device_context(); + PADDLE_ENFORCE_EQ(SortKthvalue(dev_ctx, input, input_width, + input_height, k, output, indices), + true, platform::errors::External( + "KthvalueOP: Error when use cub sorting")); + return; + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + framework::DDim tmp_out_dims = framework::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + framework::DDim trans_dims(in_dims); + framework::DDim trans_out_dims(in_dims); + for (int i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + framework::Tensor trans_input; + trans_input.mutable_data(trans_dims, ctx.GetPlace()); + int ndims = trans.size(); + const auto& dev_ctx = ctx.cuda_device_context(); + TransCompute(ndims, dev_ctx, *input, + &trans_input, trans); + framework::Tensor trans_ind, trans_out; + trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); + trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); + const int64_t input_height = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue(dev_ctx, &trans_input, input_width, input_height, k, + &trans_out, &trans_ind), + true, + platform::errors::External("KthvalueOP: Error when use cub sorting")); + TransCompute( + ndims, dev_ctx, trans_ind, indices, trans); + TransCompute(ndims, dev_ctx, trans_out, + output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } + } +}; + +template +class KthvalueOpGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::InvalidArgument( + "It must use CUDAPlace, you must check your device set.")); + auto* x = context.Input("X"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = + context.Output(framework::GradVarName("X")); + int axis = context.Attr("axis"); + int k = static_cast(context.Attr("k")); + const auto& in_dims = x->dims(); + auto out_dims = indices->dims(); + if (axis < 0) axis += in_dims.size(); + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + const T* out_grad_data = out_grad->data(); + const int64_t* indices_data = indices->data(); + int pre, n, post; + GetDims(in_dims, axis, &pre, &n, &post); + auto& dev_ctx = context.cuda_device_context(); + int block_size = 
getBlockSize(post * k); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + AssignGradWithAxis<<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + kthvalue, + ops::KthvalueOpCUDAKernel, + ops::KthvalueOpCUDAKernel, + ops::KthvalueOpCUDAKernel, + ops::KthvalueOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + kthvalue_grad, + ops::KthvalueOpGradCUDAKernel, + ops::KthvalueOpGradCUDAKernel, + ops::KthvalueOpGradCUDAKernel, + ops::KthvalueOpGradCUDAKernel); diff --git a/paddle/fluid/operators/kthvalue_op.h b/paddle/fluid/operators/kthvalue_op.h new file mode 100644 index 0000000000000..44f5ca1a25818 --- /dev/null +++ b/paddle/fluid/operators/kthvalue_op.h @@ -0,0 +1,281 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/transpose_op.h" + +namespace paddle { +namespace operators { +template +static void getKthvalue(Type input_height, Type input_width, int input_dim, + const framework::Tensor* input, T* t_out, + Type* t_indices, const int& k) { + bool partial_sort_flag = (k * 64) < input_width; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = framework::EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), col_vec.begin() + k, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } else { + std::nth_element( + col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } + t_out[i] = col_vec[k - 1].first; + t_indices[i] = col_vec[k - 1].second; + } +} + +template +static void kthvalueAssign(const Type& input_height, const Type& input_width, + const int& input_dim, const framework::Tensor* input, + const framework::Tensor* indices, T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = framework::EigenVector::Flatten(*input); + auto e_indices = 
framework::EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = + framework::EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +template +class KthvalueCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto* indices = context.Output("Indices"); + const auto& in_dims = input->dims(); + int k = static_cast(context.Attr("k")); + bool keepdim = static_cast(context.Attr("keepdim")); + int axis = static_cast(context.Attr("axis")); + if (axis < 0) axis += in_dims.size(); + T* output_data = output->mutable_data(context.GetPlace()); + int64_t* indices_data = indices->mutable_data(context.GetPlace()); + auto out_dims = output->dims(); + if (axis == in_dims.size() - 1) { + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + getKthvalue(input_height, input_width, in_dims.size(), input, + output_data, indices_data, k); + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + framework::DDim tmp_out_dims = framework::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + framework::DDim trans_dims(in_dims); + framework::DDim trans_out_dims(in_dims); + + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + framework::Tensor trans_inp; + trans_inp.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + TransCompute(ndims, dev_context, *input, + &trans_inp, trans); + + const int64_t input_height = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + framework::Tensor tmp_out, tmp_indices; + T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); + auto* t_ind = + tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); + + getKthvalue(input_height, input_width, in_dims.size(), + &trans_inp, t_out, t_ind, k); + TransCompute( + ndims, dev_context, tmp_indices, indices, trans); + TransCompute(ndims, dev_context, tmp_out, + output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } + } +}; + +template +class KthvalueGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = + context.Output(framework::GradVarName("X")); + int axis = static_cast(context.Attr("axis")); + 
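// Note on the gradient rule implemented below: the X gradient has the shape of
// X and is first zero-filled; kthvalueAssign then writes each Out gradient
// value to the position along `axis` recorded in Indices, so only the selected
// k-th-value elements receive a non-zero gradient.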
bool keepdim = static_cast(context.Attr("keepdim")); + auto in_dims = x->dims(); + auto out_dims = indices->dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = framework::make_ddim(tmp_out_shape); + } + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + if (axis == in_dims.size() - 1) { + const int64_t input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + if (keepdim) { + kthvalueAssign(input_height, input_width, in_dims.size(), out_grad, + indices, x_grad_data); + } else { + auto& dev_context = + context.template device_context(); + framework::Tensor out_grad_tmp, indices_tmp; + out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); + indices_tmp.mutable_data(indices->dims(), + dev_context.GetPlace()); + framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, + &out_grad_tmp); + framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, + &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, + &indices_tmp, x_grad_data); + } + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + framework::DDim trans_dims(out_dims); + framework::DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + framework::Tensor trans_dO, trans_ind; + trans_dO.mutable_data(trans_dims, context.GetPlace()); + trans_ind.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + if (keepdim) { + TransCompute( + ndims, dev_context, *out_grad, &trans_dO, trans); + TransCompute( + ndims, dev_context, *indices, &trans_ind, trans); + } else { + framework::Tensor out_grad_tmp, indices_tmp; + out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); + indices_tmp.mutable_data(indices->dims(), + dev_context.GetPlace()); + framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, + &out_grad_tmp); + framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, + &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + TransCompute( + ndims, dev_context, out_grad_tmp, &trans_dO, trans); + TransCompute( + ndims, dev_context, indices_tmp, &trans_ind, trans); + } + const int64_t input_height = framework::product( + framework::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + framework::Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + kthvalueAssign(input_height, input_width, in_dims.size(), + &trans_dO, &trans_ind, t_out); + TransCompute(ndims, dev_context, tmp_out, + x_grad, trans); + } + } +}; +} // namespace operators +} // namespace paddle diff --git 
a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu index c94a37f03f2b7..2e7d1de3bd756 100644 --- a/paddle/fluid/operators/label_smooth_op.cu +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -13,19 +13,39 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/label_smooth_op.h" namespace paddle { namespace operators { template -__global__ void LabelSmoothRunOriginKernel(const int N, const float epsilon, - const int label_dim, const T* src, - T* dst) { - CUDA_KERNEL_LOOP(idx, N) { - dst[idx] = static_cast(1 - epsilon) * src[idx] + - static_cast(epsilon / label_dim); +struct LabelSmoothFunctor { + T epsilon; + T label_dim; + + __forceinline__ LabelSmoothFunctor(float epsilon_data, int label_dim_data) { + epsilon = static_cast(epsilon_data); + label_dim = static_cast(label_dim_data); } -} + + __device__ __forceinline__ T operator()(const T& x) const { + return (static_cast(1 - epsilon) * x + + static_cast(epsilon / label_dim)); + } +}; + +template +struct LabelSmoothGradFunctor { + T epsilon; + + __forceinline__ LabelSmoothGradFunctor(float epsilon_data) { + epsilon = static_cast(epsilon_data); + } + + __device__ __forceinline__ T operator()(const T& x) const { + return static_cast(1 - epsilon) * x; + } +}; template __global__ void LabelSmoothRunDistKernel(const int N, const float epsilon, @@ -38,14 +58,6 @@ __global__ void LabelSmoothRunDistKernel(const int N, const float epsilon, } } -template -__global__ void LabelSmoothGradRunKernel(const int N, const float epsilon, - const T* src, T* dst) { - CUDA_KERNEL_LOOP(idx, N) { - dst[idx] = static_cast(1 - epsilon) * src[idx]; - } -} - template class LabelSmoothGPUKernel : public framework::OpKernel { public: @@ -69,8 +81,14 @@ class LabelSmoothGPUKernel : public framework::OpKernel { size_prob, epsilon, dist_numel, in_data, dist_data, out_data); } else { - LabelSmoothRunOriginKernel<<>>( - size_prob, epsilon, label_dim, in_data, out_data); + auto& dev_ctx = + ctx.template device_context(); + + std::vector ins = {in_t}; + std::vector outs = {out_t}; + auto functor = LabelSmoothFunctor(epsilon, label_dim); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } } }; @@ -84,15 +102,13 @@ class LabelSmoothGradGPUKernel : public framework::OpKernel { d_in_t->mutable_data(ctx.GetPlace()); auto epsilon = ctx.Attr("epsilon"); - auto& dev = *ctx.template device_context().eigen_device(); - const T* in_data = d_out_t->data(); - auto size_prob = d_out_t->numel(); - T* out_data = d_in_t->mutable_data(ctx.GetPlace()); - int threads = 512; - int grid = (size_prob + threads - 1) / threads; - auto stream = ctx.cuda_device_context().stream(); - LabelSmoothGradRunKernel<<>>( - size_prob, epsilon, in_data, out_data); + auto& dev_ctx = ctx.template device_context(); + + std::vector ins = {d_out_t}; + std::vector outs = {d_in_t}; + auto functor = LabelSmoothGradFunctor(epsilon); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } }; } // namespace operators diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 3fe453bda2d9e..7725f336416db 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -64,8 +64,8 @@ class LayerNormKernel auto *mean_data = mean->mutable_data(ctx.GetPlace()); auto *var_data = 
var->mutable_data(ctx.GetPlace()); - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); + auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); framework::proto::VarType::Type x_dtype = x->type(); framework::proto::VarType::Type scale_bias_dtype; diff --git a/paddle/fluid/operators/lstsq_op.cc b/paddle/fluid/operators/lstsq_op.cc new file mode 100644 index 0000000000000..65fe99e2ead2e --- /dev/null +++ b/paddle/fluid/operators/lstsq_op.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/lstsq_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class LstsqOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "LstsqOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "LstsqOp"); + + OP_INOUT_CHECK(ctx->HasOutput("Solution"), "Output", "Solution", "LstsqOp"); + OP_INOUT_CHECK(ctx->HasOutput("Rank"), "Output", "Rank", "LstsqOp"); + OP_INOUT_CHECK(ctx->HasOutput("SingularValues"), "Output", "SingularValues", + "LstsqOp"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + int x_rank = x_dims.size(); + int y_rank = y_dims.size(); + + PADDLE_ENFORCE_GE(x_rank, 2, + platform::errors::InvalidArgument( + "Expects input tensor x to be not less than " + "2 dimentions, but got dimention %d", + x_rank)); + PADDLE_ENFORCE_GE(y_rank, 2, + platform::errors::InvalidArgument( + "Expects input tensor y to be not less than " + "2 dimentions, but got dimention %d", + y_rank)); + + PADDLE_ENFORCE_EQ( + x_rank, y_rank, + platform::errors::InvalidArgument( + "Expects input tensor x and y to have the same dimension " + "but got x's dimention [%d] and y's dimention [%d]", + x_rank, y_rank)); + + std::vector batch_dims_vec{}; + for (int i = 0; i < x_rank - 2; ++i) { + PADDLE_ENFORCE_EQ( + x_dims[i], y_dims[i], + platform::errors::InvalidArgument( + "Expects input tensor x and y to have the same batch " + "dimension, but got x's batch dimention [%d] and " + "y's batch dimention [%d] in %d-th dim", + x_dims[i], y_dims[i], i)); + batch_dims_vec.emplace_back(x_dims[i]); + } + + PADDLE_ENFORCE_EQ( + x_dims[x_rank - 2], y_dims[y_rank - 2], + platform::errors::InvalidArgument( + "Expects input tensor x and y to have the same row dimension " + "of the inner-most 2-dims matrix, " + "but got x's row dimention [%d] and y's row dimention [%d]", + x_dims[x_rank - 2], y_dims[y_rank - 2])); + + ctx->SetOutputDim("Rank", framework::make_ddim(batch_dims_vec)); + + batch_dims_vec.emplace_back( + std::min(x_dims[x_rank - 2], x_dims[x_rank - 1])); + 
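// Summary of the shapes inferred here, given X: (*, m, n) and Y: (*, m, k):
//   Rank           -> (*)
//   SingularValues -> (*, min(m, n))
//   Solution       -> (*, n, k)
// batch_dims_vec is reused: min(m, n) is appended for SingularValues, then
// that same slot (batch_dims_vec[x_rank - 2]) is overwritten with n and
// y_dims[x_rank - 1] (= k) is appended for Solution.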
ctx->SetOutputDim("SingularValues", framework::make_ddim(batch_dims_vec)); + + batch_dims_vec[x_rank - 2] = x_dims[x_rank - 1]; + batch_dims_vec.emplace_back(y_dims[x_rank - 1]); + ctx->SetOutputDim("Solution", framework::make_ddim(batch_dims_vec)); + } + + protected: + // The output of lstsq is always complex-valued even for real-valued inputs + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + if (dtype != framework::proto::VarType::FP32 && + dtype != framework::proto::VarType::FP64) { + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported data type: %s!", dtype)); + } + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +class LstsqOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), A real-valued tensor with shape (*, m, n). " + "The accepted datatype is one of float32, float64"); + AddInput("Y", + "(Tensor), A real-valued tensor with shape (*, m, k). " + "The accepted datatype is one of float32, float64"); + AddAttr( + "rcond", + "(float, default 0.0), A float value used to determine the effective " + "rank of A.") + .SetDefault(0.0f); + AddAttr("driver", + "(string, default \"gels\"). " + "name of the LAPACK method to be used.") + .SetDefault("gels"); + AddOutput("Solution", + "(Tensor), The output Solution tensor with shape (*, n, k)."); + AddOutput("Rank", "(Tensor), The output Rank tensor with shape (*)."); + AddOutput( + "SingularValues", + "(Tensor), The output SingularValues tensor with shape (*, min(m,n))."); + AddComment(R"DOC( + Lstsq Operator. +This API processes Lstsq functor for general matrices. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lstsq, ops::LstsqOp, ops::LstsqOpMaker) + +REGISTER_OP_CPU_KERNEL( + lstsq, ops::LstsqCPUKernel, + ops::LstsqCPUKernel); \ No newline at end of file diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h new file mode 100644 index 0000000000000..b9c5c87a6a376 --- /dev/null +++ b/paddle/fluid/operators/lstsq_op.h @@ -0,0 +1,229 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/operators/eig_op.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/operators/math/eigen_values_vectors.h" +#include "paddle/fluid/operators/math/lapack_function.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matrix_solve.h" +#include "paddle/fluid/operators/svd_helper.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/operators/triangular_solve_op.h" +#include "paddle/fluid/platform/for_range.h" + +#define EPSILON 1e-6 + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +enum class LapackDriverType : int { Gels, Gelsd, Gelsy, Gelss }; + +using DDim = framework::DDim; +static DDim UDDim(const DDim& x_dim) { + auto x_vec = vectorize(x_dim); + return framework::make_ddim(x_vec); +} + +template +class LstsqCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using ValueType = math::Real; + + const Tensor& x = *context.Input("X"); + const Tensor& y = *context.Input("Y"); + auto rcond = context.Attr("rcond"); + auto driver_string = context.Attr("driver"); + + static auto driver_type = std::unordered_map( + {{"gels", LapackDriverType::Gels}, + {"gelsy", LapackDriverType::Gelsy}, + {"gelsd", LapackDriverType::Gelsd}, + {"gelss", LapackDriverType::Gelss}}); + auto driver = driver_type[driver_string]; + + auto solution = context.Output("Solution"); + auto* rank = context.Output("Rank"); + auto* singular_values = context.Output("SingularValues"); + + auto dito = + math::DeviceIndependenceTensorOperations(context); + + auto x_dims = x.dims(); + auto y_dims = y.dims(); + int dim_size = x_dims.size(); + int x_stride = MatrixStride(x); + int y_stride = MatrixStride(y); + int batch_count = BatchCount(x); + auto ori_solution_dim = solution->dims(); + int ori_solu_stride = MatrixStride(*solution); + + // lapack is a column-major storge, transpose make the input to + // have a continuous memory layout + int info = 0; + int m = x_dims[dim_size - 2]; + int n = x_dims[dim_size - 1]; + int nrhs = y_dims[dim_size - 1]; + int lda = std::max(m, 1); + int ldb = std::max(1, std::max(m, n)); + + Tensor new_x; + new_x.mutable_data(context.GetPlace(), + size_t(batch_count * m * n * sizeof(T))); + solution->mutable_data( + context.GetPlace(), + size_t(batch_count * std::max(m, n) * nrhs * sizeof(T))); + framework::TensorCopy(x, context.GetPlace(), &new_x); + framework::TensorCopy(y, context.GetPlace(), solution); + + if (m < n) solution->Resize(UDDim(ori_solution_dim)); + + Tensor input_x_trans = dito.Transpose(new_x); + Tensor input_y_trans = dito.Transpose(*solution); + framework::TensorCopy(input_x_trans, new_x.place(), &new_x); + framework::TensorCopy(input_y_trans, solution->place(), solution); + + auto* x_vector = new_x.data(); + auto* y_vector = solution->data(); + + // "gels" divers does not need to compute rank + int rank_32 = 0; + int* rank_data = nullptr; + int* rank_working_ptr = nullptr; + if (driver != LapackDriverType::Gels) { + rank_data = rank->mutable_data(context.GetPlace()); + rank_working_ptr = rank_data; + } + + // "gelsd" and "gelss" divers need to compute singular values + ValueType* s_data = nullptr; + ValueType* s_working_ptr = nullptr; + int s_stride = 0; + if (driver == LapackDriverType::Gelsd || + driver == LapackDriverType::Gelss) { + s_data = singular_values->mutable_data(context.GetPlace()); + 
s_working_ptr = s_data; + auto s_dims = singular_values->dims(); + s_stride = s_dims[s_dims.size() - 1]; + } + + // "jpvt" is only used for "gelsy" driver + Tensor jpvt; + int* jpvt_data = nullptr; + if (driver == LapackDriverType::Gelsy) { + jpvt.Resize(framework::make_ddim({std::max(1, n)})); + jpvt_data = jpvt.mutable_data(context.GetPlace()); + } + + // run once the driver, first to get the optimal workspace size + int lwork = -1; + T wkopt; + ValueType rwkopt; + int iwkopt = 0; + + if (driver == LapackDriverType::Gels) { + math::lapackGels('N', m, n, nrhs, x_vector, lda, y_vector, ldb, &wkopt, + lwork, &info); + } else if (driver == LapackDriverType::Gelsd) { + math::lapackGelsd(m, n, nrhs, x_vector, lda, y_vector, ldb, s_working_ptr, + static_cast(rcond), &rank_32, &wkopt, lwork, + &rwkopt, &iwkopt, &info); + } else if (driver == LapackDriverType::Gelsy) { + math::lapackGelsy(m, n, nrhs, x_vector, lda, y_vector, ldb, jpvt_data, + static_cast(rcond), &rank_32, &wkopt, lwork, + &rwkopt, &info); + } else if (driver == LapackDriverType::Gelss) { + math::lapackGelss(m, n, nrhs, x_vector, lda, y_vector, ldb, s_working_ptr, + static_cast(rcond), &rank_32, &wkopt, lwork, + &rwkopt, &info); + } + + lwork = std::max(1, static_cast(math::Real(wkopt))); + Tensor work; + work.Resize(framework::make_ddim({lwork})); + T* work_data = work.mutable_data(context.GetPlace()); + + // "rwork" only used for complex inputs and "gelsy/gelsd/gelss" drivers + Tensor rwork; + ValueType* rwork_data = nullptr; + if (framework::IsComplexType(x.type()) && + driver != LapackDriverType::Gels) { + int rwork_len = 0; + if (driver == LapackDriverType::Gelsy) { + rwork_len = std::max(1, 2 * n); + } else if (driver == LapackDriverType::Gelss) { + rwork_len = std::max(1, 5 * std::min(m, n)); + } else if (driver == LapackDriverType::Gelsd) { + rwork_len = std::max(1, rwkopt); + } + rwork.Resize(framework::make_ddim({rwork_len})); + rwork_data = rwork.mutable_data(context.GetPlace()); + } + + // "iwork" workspace array is relavant only for "gelsd" driver + Tensor iwork; + int* iwork_data = nullptr; + if (driver == LapackDriverType::Gelsd) { + iwork.Resize(framework::make_ddim({std::max(1, iwkopt)})); + iwork_data = iwork.mutable_data(context.GetPlace()); + } + + int solu_stride = std::max(y_stride, ori_solu_stride); + for (auto i = 0; i < batch_count; ++i) { + auto* x_input = &x_vector[i * x_stride]; + auto* y_input = &y_vector[i * solu_stride]; + rank_working_ptr = rank_working_ptr ? &rank_data[i] : nullptr; + s_working_ptr = s_working_ptr ? 
&s_data[i * s_stride] : nullptr; + + if (driver == LapackDriverType::Gels) { + math::lapackGels('N', m, n, nrhs, x_input, lda, y_input, ldb, work_data, + lwork, &info); + } else if (driver == LapackDriverType::Gelsd) { + math::lapackGelsd(m, n, nrhs, x_input, lda, y_input, ldb, s_working_ptr, + static_cast(rcond), &rank_32, work_data, + lwork, rwork_data, iwork_data, &info); + } else if (driver == LapackDriverType::Gelsy) { + math::lapackGelsy(m, n, nrhs, x_input, lda, y_input, ldb, jpvt_data, + static_cast(rcond), &rank_32, work_data, + lwork, rwork_data, &info); + } else if (driver == LapackDriverType::Gelss) { + math::lapackGelss(m, n, nrhs, x_input, lda, y_input, ldb, s_working_ptr, + static_cast(rcond), &rank_32, work_data, + lwork, rwork_data, &info); + } + + PADDLE_ENFORCE_EQ( + info, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: Lapack info is not zero but [%d]", i, info)); + + if (rank_working_ptr) *rank_working_ptr = static_cast(rank_32); + } + + Tensor tmp_s = dito.Transpose(*solution); + framework::TensorCopy(tmp_s, solution->place(), solution); + + if (m >= n) solution->Resize(UDDim(ori_solution_dim)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc new file mode 100644 index 0000000000000..aff6a77762fa3 --- /dev/null +++ b/paddle/fluid/operators/lu_op.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lu_op.h" + +namespace paddle { +namespace operators { + +class LUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddComment(R"DOC(LU decomposition, + Computes the LU factorization of a matrix or batches of matrices A. + )DOC"); + AddInput("X", "(Tensor) The input tensor, shape of (*,m,n)"); + AddOutput("Out", "(Tensor) The output tensor, shape same to X"); + AddOutput("Pivots", + "Stores all the intermediate transpositions of rows. 
shape of " + "(*,min(m,n))"); + AddOutput("Infos", + "(Tensor) This is a tensor of size (*) where non-zero values " + "indicate whether factorization for the matrix has succeeded"); + AddAttr("pivots", "Whether pivoting is done").SetDefault(true); + } +}; + +class LUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "LU"); + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "LU"); + bool pivots = context->Attrs().Get("pivots"); + auto x_dims = context->GetInputDim("X"); + int x_rank = x_dims.size(); + PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( + "the rank of input must greater than 2")); + context->SetOutputDim("Out", x_dims); + int m = x_dims[x_rank - 1]; + int n = x_dims[x_rank - 2]; + int min_mn = std::min(m, n); + auto dims_vec = framework::vectorize(x_dims); + OP_INOUT_CHECK(context->HasOutput("Infos"), "Output", "Infos", "LU"); + if (x_rank == 2) { + auto Infos_dim = std::vector(1); + context->SetOutputDim("Infos", framework::make_ddim(Infos_dim)); + } else { + auto Infos_dim = + std::vector(dims_vec.begin(), dims_vec.begin() + x_rank - 2); + context->SetOutputDim("Infos", framework::make_ddim(Infos_dim)); + } + if (pivots) { + OP_INOUT_CHECK(context->HasOutput("Pivots"), "Output", "Pivots", "LU"); + auto Pivots_dim = + std::vector(dims_vec.begin(), dims_vec.begin() + x_rank - 1); + Pivots_dim[x_rank - 2] = min_mn; + context->SetOutputDim("Pivots", framework::make_ddim(Pivots_dim)); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class LUOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto var_type = ctx->GetInputType("X", 0); + auto data_type = ctx->GetInputDataType("X", 0); + + ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS); + ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS); + + ctx->SetOutputType("Pivots", var_type, framework::ALL_ELEMENTS); + ctx->SetOutputDataType("Pivots", framework::proto::VarType::INT32, + framework::ALL_ELEMENTS); + + ctx->SetOutputType("Infos", var_type, framework::ALL_ELEMENTS); + ctx->SetOutputDataType("Infos", framework::proto::VarType::INT32, + framework::ALL_ELEMENTS); + } +}; + +template +class LUKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto pivots = ctx.Attr("pivots"); + auto *xin = ctx.Input("X"); + auto *out = ctx.Output("Out"); + auto *IpivT = ctx.Output("Pivots"); + auto *InfoT = ctx.Output("Infos"); + PADDLE_ENFORCE_EQ(pivots, true, + platform::errors::InvalidArgument( + "lu without pivoting is not implemented on the CPU, " + "but got pivots=False")); + + math::DeviceIndependenceTensorOperations + helper(ctx); + *out = helper.Transpose(*xin); + + auto outdims = out->dims(); + auto outrank = outdims.size(); + + int m = static_cast(outdims[outrank - 1]); + int n = static_cast(outdims[outrank - 2]); + int lda = std::max(1, m); + + auto ipiv_dims = slice_ddim(outdims, 0, outrank - 1); + ipiv_dims[outrank - 2] = std::min(m, n); + IpivT->Resize(ipiv_dims); + auto ipiv_data = IpivT->mutable_data(ctx.GetPlace()); + + auto 
info_dims = slice_ddim(outdims, 0, outrank - 2); + if (info_dims.size() == 0) { + info_dims = framework::make_ddim({1}); + } + InfoT->Resize(info_dims); + auto info_data = InfoT->mutable_data(ctx.GetPlace()); + + auto batchsize = product(info_dims); + batchsize = std::max(static_cast(batchsize), 1); + auto out_data = out->mutable_data(ctx.GetPlace()); + for (int b = 0; b < batchsize; b++) { + auto out_data_item = &out_data[b * m * n]; + int *info_data_item = &info_data[b]; + int *ipiv_data_item = &ipiv_data[b * std::min(m, n)]; + math::lapackLu(m, n, out_data_item, lda, ipiv_data_item, + info_data_item); + } + *out = helper.Transpose(*out); + } +}; + +template +class LUOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("lu_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput("Out", this->Output("Out")); + retv->SetInput("Pivots", this->Output("Pivots")); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class LUGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto var_type = ctx->GetInputType("X", 0); + auto data_type = ctx->GetInputDataType("X", 0); + + ctx->SetOutputType(framework::GradVarName("X"), var_type, + framework::ALL_ELEMENTS); + ctx->SetOutputDataType(framework::GradVarName("X"), data_type, + framework::ALL_ELEMENTS); + } +}; + +class LUGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lu"); + OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "lu"); + OP_INOUT_CHECK(ctx->HasInput("Pivots"), "Input", "Pivots", "lu"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "lu"); + + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +DECLARE_INPLACE_OP_INFERER(LUOpInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(LUGradOpInplaceInferer, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(lu, ops::LUOp, ops::LUOpMaker, ops::LUOpVarTypeInference, + ops::LUOpGradMaker, + ops::LUOpGradMaker, + ops::LUOpInplaceInferer); +REGISTER_OPERATOR(lu_grad, ops::LUGradOp, ops::LUGradOpVarTypeInference, + ops::LUGradOpInplaceInferer); + +REGISTER_OP_CPU_KERNEL(lu, ops::LUKernel, ops::LUKernel); +REGISTER_OP_CPU_KERNEL(lu_grad, + ops::LUGradKernel, + ops::LUGradKernel); diff --git a/paddle/fluid/operators/lu_op.cu b/paddle/fluid/operators/lu_op.cu new file mode 100644 index 0000000000000..f395b39c17ea9 --- /dev/null +++ b/paddle/fluid/operators/lu_op.cu @@ -0,0 +1,159 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/lu_op.h" +#include "paddle/fluid/platform/dynload/cusolver.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using CUDADeviceContext = paddle::platform::CUDADeviceContext; + +template +void cusolver_bufferSize(const cusolverDnHandle_t& cusolverH, int m, int n, + T* d_A, int lda, int* lwork); +template +void cusolver_getrf(const cusolverDnHandle_t& cusolverH, int m, int n, T* d_A, + int lda, T* d_work, int* d_Ipiv, int* d_info); + +template <> +void cusolver_bufferSize(const cusolverDnHandle_t& cusolverH, int m, + int n, float* d_A, int lda, int* lwork) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgetrf_bufferSize( + cusolverH, m, n, d_A, lda, lwork)); +} + +template <> +void cusolver_bufferSize(const cusolverDnHandle_t& cusolverH, int m, + int n, double* d_A, int lda, int* lwork) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgetrf_bufferSize( + cusolverH, m, n, d_A, lda, lwork)); +} + +template <> +void cusolver_getrf(const cusolverDnHandle_t& cusolverH, int m, int n, + float* d_A, int lda, float* d_work, int* d_Ipiv, + int* d_info) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgetrf( + cusolverH, m, n, d_A, lda, d_work, d_Ipiv, d_info)); +} + +template <> +void cusolver_getrf(const cusolverDnHandle_t& cusolverH, int m, int n, + double* d_A, int lda, double* d_work, int* d_Ipiv, + int* d_info) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgetrf( + cusolverH, m, n, d_A, lda, d_work, d_Ipiv, d_info)); +} + +template +void lu_decomposed_kernel(int m, int n, T* d_A, int lda, int* d_Ipiv, + int* d_info, const framework::ExecutionContext& ctx) { + /* step 1: get cusolver handle*/ + auto& dev_ctx = ctx.template device_context(); + auto cusolverH = dev_ctx.cusolver_dn_handle(); + + /* step 2: query working space of getrf */ + int lwork; + cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork); + + auto work_buff = memory::Alloc(dev_ctx, lwork * sizeof(T)); + T* d_work = reinterpret_cast(work_buff->ptr()); + + /* step 3: LU factorization */ + if (d_Ipiv) { + cusolver_getrf(cusolverH, m, n, d_A, lda, d_work, d_Ipiv, d_info); + } else { + cusolver_getrf(cusolverH, m, n, d_A, lda, d_work, NULL, d_info); + } + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +} + +template +class LUCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef __HIPCC__ + const int64_t kMaxBlockDim = 256; +#else + const int64_t kMaxBlockDim = 512; +#endif + auto* xin = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto* IpivT = ctx.Output("Pivots"); + auto* InfoT = ctx.Output("Infos"); + auto pivots = ctx.Attr("pivots"); + + math::DeviceIndependenceTensorOperations< + paddle::platform::CUDADeviceContext, T> + helper(ctx); + *out = helper.Transpose(*xin); + + auto outdims = out->dims(); + 
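+ // `out` now holds the transpose of X: cuSOLVER's getrf works on
+ // column-major matrices, so each matrix in the batch is factorized in place
+ // on the transposed copy and transposed back at the end of Compute().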
auto outrank = outdims.size(); + + int m = static_cast(outdims[outrank - 1]); + int n = static_cast(outdims[outrank - 2]); + int lda = std::max(1, m); + if (pivots) { + auto ipiv_dims = slice_ddim(outdims, 0, outrank - 1); + ipiv_dims[outrank - 2] = std::min(m, n); + IpivT->Resize(ipiv_dims); + } + auto ipiv_data = IpivT->mutable_data(ctx.GetPlace()); + + auto info_dims = slice_ddim(outdims, 0, outrank - 2); + if (info_dims.size() == 0) { + info_dims = framework::make_ddim({1}); + } + InfoT->Resize(info_dims); + auto info_data = InfoT->mutable_data(ctx.GetPlace()); + + auto batchsize = product(info_dims); + batchsize = std::max(static_cast(batchsize), 1); + auto out_data = out->mutable_data(ctx.GetPlace()); + for (int b = 0; b < batchsize; b++) { + auto out_data_item = &out_data[b * m * n]; + int* info_data_item = &info_data[b]; + if (pivots) { + auto ipiv_data_item = &ipiv_data[b * std::min(m, n)]; + lu_decomposed_kernel(m, n, out_data_item, lda, ipiv_data_item, + info_data_item, ctx); + } else { + lu_decomposed_kernel(m, n, out_data_item, lda, NULL, info_data_item, + ctx); + } + } + *out = helper.Transpose(*out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(lu, ops::LUCUDAKernel, + ops::LUCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lu_grad, + ops::LUGradKernel, + ops::LUGradKernel); + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h new file mode 100644 index 0000000000000..21839c263e4a8 --- /dev/null +++ b/paddle/fluid/operators/lu_op.h @@ -0,0 +1,699 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/lapack_function.h" +#include "paddle/fluid/operators/set_value_op.h" +#include "paddle/fluid/operators/svd_helper.h" +#include "paddle/fluid/operators/triangular_solve_op.h" +#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/pten/kernels/math_kernel.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensorArray = framework::LoDTensorArray; + +template +void SetValueCompute(const framework::ExecutionContext& ctx, + framework::Tensor* in, framework::Tensor* value_tensor, + framework::Tensor* out, const std::vector& axes, + std::vector* starts, std::vector* ends, + const std::vector& shape) { + std::vector steps = {1, 1}; + std::vector decrease_axes = {}; + std::vector none_axes = {}; + + auto dtype = in->type(); + + auto in_dims = in->dims(); + CheckAndUpdateSliceAttrs(in_dims, axes, starts, ends, &steps); + auto slice_dims = GetSliceDims(in_dims, axes, *starts, *ends, &steps); + auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); + + auto slice_dims_for_assign = decrease_slice_dims; + if (!none_axes.empty()) { + std::vector slice_dims_with_none; + + size_t none_axes_cur = 0, decrease_axes_cur = 0; + for (int i = 0; i < slice_dims.size(); ++i) { + while (none_axes_cur < none_axes.size() && + none_axes[none_axes_cur] <= i) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + if (decrease_axes_cur < decrease_axes.size() && + decrease_axes[decrease_axes_cur] == i) { + decrease_axes_cur++; + } else { + slice_dims_with_none.push_back(slice_dims[i]); + } + } + while (none_axes_cur < none_axes.size()) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + + slice_dims_for_assign = framework::make_ddim(slice_dims_with_none); + } + + auto place = ctx.GetPlace(); + auto& eigen_place = + *ctx.template device_context().eigen_device(); + + // Here copy data from input to avoid data loss at PE and Graph level. + // TODO(liym27): Speed up in the future version. + // - Q: Why don't call ShareDataWith to speed up? + // - A: Because it's not supported to ShareDataWith on OP's input and output + // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP + // - Q: Why don't delete Input, after all, the input and output are the same + // Tensor at program level? + // - A: If deleting Input, the graph will be complex, such as there will + // be two ops points to the output in graph: op1 -> output <- set_value. + // In this case, we have to find a way to handle the running order of + // set_value is what we want. 
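+ // In outline, the remainder of this function performs the assignment in
+ // three steps: (1) copy the input into `out` and zero the target slice of
+ // `out`; (2) build a pad tensor that is zero everywhere except the slice,
+ // where it holds (0 - value), computed with broadcasting; (3) compute
+ // out = out - pad, which leaves the untouched region unchanged and writes
+ // the value into the slice.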
+ TensorCopy(*in, place, out); + + Tensor slice_tensor(dtype), pad_tensor(dtype); + slice_tensor.mutable_data(slice_dims, place); + pad_tensor.mutable_data(in_dims, place); + + auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); + auto out_e = framework::EigenTensor::From(*out); + auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); + + // Step 1: Set the value of out at `_index` to zero + slice_e.device(eigen_place) = slice_e.constant(T(0)); + + auto starts_indices = Eigen::DSizes(); + auto ends_indices = Eigen::DSizes(); + auto strides_indices = Eigen::DSizes(); + + for (size_t i = 0; i < D; ++i) { + starts_indices[i] = 0; + ends_indices[i] = slice_dims[i]; + strides_indices[i] = 1; + } + for (size_t i = 0; i < axes.size(); i++) { + int axis_index = axes[i]; + starts_indices[axis_index] = (*starts)[i]; + ends_indices[axis_index] = (*ends)[i]; + strides_indices[axis_index] = steps[i]; + if ((*starts)[i] == + (*ends)[i]) { // slice is empty, data will not be changed + return; + } + } + + out_e.stridedSlice(starts_indices, ends_indices, strides_indices) + .device(eigen_place) = slice_e; + + // Step 2: Set a tensor with the same shape as out tensor. And its data at + // '_index' is the same as value_tensor, and data out of '_index' to zero + + // - Step 2.1 Set slice tensor with value + + // NOTE(liym27): [ Why resize slice_tensor here? ] + // A: When do broadcasting on slice_tensor and value_tensor, the shape of + // slice_tensor should be decreased dims. + // e.g. + // x[:,0] = value_tensor + // x's shape = [3, 4], value_tensor's shape = [3] + // We get slice_dims = [3, 1], decrease_slice_dims = [3] + // If do broadcasting on Tensor with shape [3, 1] and [3], the result's + // shape is [3, 3], which cross the border; + // If do broadcasting on Tensor with shape [3] and [3], the result's shape + // is [3], which is right. 
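+ // The resize below applies slice_dims_for_assign, i.e. the decreased dims
+ // computed earlier, so that broadcasting against value_tensor follows the
+ // x[:, 0] = value semantics described above.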
+ + slice_tensor.Resize(slice_dims_for_assign); + if (value_tensor != nullptr) { + CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims()); + // ElementwiseComputeEx can do broadcasting + ElementwiseComputeEx, DeviceContext, T>( + ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); + } else { + Tensor value_t(dtype); + auto value_dims = framework::make_ddim(shape); + CheckIsDimsMatch(slice_dims_for_assign, value_dims); + + value_t.mutable_data(value_dims, place); + auto value_name = GetValueName(dtype); + CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); + value_t.Resize(value_dims); + ElementwiseComputeEx, DeviceContext, T>( + ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); + } + slice_tensor.Resize(slice_dims); + + // - Step 2.2 Pad slice tensor with 0 + pad_e.device(eigen_place) = pad_e.constant(T(0)); + pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) + .device(eigen_place) = slice_e; + + // Step 3: Set out tensor with value_tensor + out_e.device(eigen_place) = out_e - pad_e; +} + +template +void SetValueCompute_dispatch( + const framework::ExecutionContext& ctx, framework::Tensor* in, + framework::Tensor* value_tensor, framework::Tensor* out, + const std::vector& axes, std::vector* starts, + std::vector* ends, const std::vector& shape, int rank) { + switch (rank) { + case 1: + SetValueCompute(ctx, in, value_tensor, out, axes, + starts, ends, shape); + break; + case 2: + SetValueCompute(ctx, in, value_tensor, out, axes, + starts, ends, shape); + break; + case 3: + SetValueCompute(ctx, in, value_tensor, out, axes, + starts, ends, shape); + break; + case 4: + SetValueCompute(ctx, in, value_tensor, out, axes, + starts, ends, shape); + break; + case 5: + SetValueCompute(ctx, in, value_tensor, out, axes, + starts, ends, shape); + break; + case 6: + SetValueCompute(ctx, in, value_tensor, out, axes, + starts, ends, shape); + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", rank)); + } +} + +template +void Tensor_Conj(const DeviceContext& dev_ctx, const framework::Tensor& tensor, + framework::Tensor* out) { + out->Resize(tensor.dims()); + platform::ForRange out_for_range(dev_ctx, tensor.numel()); + math::ConjFunctor out_functor(tensor.data(), tensor.numel(), + out->mutable_data(dev_ctx.GetPlace())); + out_for_range(out_functor); +} + +template +void Tensor_Add(const DeviceContext& dev_ctx, const framework::Tensor& src1, + const framework::Tensor& src2, framework::Tensor* out) { + out->Resize(src1.dims()); + out->mutable_data(dev_ctx.GetPlace()); + auto pt_x = paddle::experimental::MakePtenDenseTensor(src1); + auto pt_y = paddle::experimental::MakePtenDenseTensor(src2); + auto pt_z = paddle::experimental::MakePtenDenseTensor(*out); + pten::AddKernel(dev_ctx, *pt_x.get(), *pt_y.get(), -1, + pt_z.get()); +} + +template +void Tensor_Sub(const DeviceContext& dev_ctx, const framework::Tensor& src1, + const framework::Tensor& src2, framework::Tensor* out) { + out->Resize(src1.dims()); + out->mutable_data(dev_ctx.GetPlace()); + auto pt_x = paddle::experimental::MakePtenDenseTensor(src1); + auto pt_y = paddle::experimental::MakePtenDenseTensor(src2); + auto pt_z = paddle::experimental::MakePtenDenseTensor(*out); + pten::SubtractKernel(dev_ctx, *pt_x.get(), *pt_y.get(), -1, + pt_z.get()); +} + +template +void SliceCompute(const framework::ExecutionContext& ctx, + const framework::Tensor* in, framework::Tensor* out, + const std::vector& axes_int, + const 
std::vector& starts_int, + const std::vector& ends_int) { + std::vector axes(axes_int.begin(), axes_int.end()); + std::vector starts(starts_int.begin(), starts_int.end()); + std::vector ends(ends_int.begin(), ends_int.end()); + + std::vector decrease_axis = {}; + std::vector infer_flags = {}; + + PADDLE_ENFORCE_EQ( + starts.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of starts must be equal to the size of axes.")); + PADDLE_ENFORCE_EQ(ends.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of ends must be equal to the size of axes.")); + + // Step 2: Compute output + + auto in_dims = in->dims(); + auto out_dims = out->dims(); + auto slice_dims = out_dims; + + // 2.1 Infer output dims + for (size_t i = 0; i < axes.size(); ++i) { + // when start == -1 && end == start+1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = in_dims[axes[i]]; + } + } + } + + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = + GetSliceDims(in_dims, axes, starts, ends, nullptr, nullptr); + out_dims = GetDecreasedDims(slice_dims, decrease_axis); + + // 2.2 Get output + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); + + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = slice_dims[i]; + } + for (size_t i = 0; i < axes.size(); ++i) { + offsets[axes[i]] = starts[i]; + } + + out->Resize(slice_dims); + out->mutable_data(ctx.GetPlace()); + + auto in_t = framework::EigenTensor::From(*in, in_dims); + auto out_t = framework::EigenTensor::From(*out, slice_dims); + auto& eigen_place = + *ctx.template device_context().eigen_device(); + + if (in->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.slice: + // if element number less than INT_MAX, change the type of index to int + Eigen::DSizes offsets_32bit, extents_32bit; + for (size_t i = 0; i < D; i++) { + offsets_32bit[i] = offsets[i]; + extents_32bit[i] = extents[i]; + } + EigenSlice, T, D>::Eval( + eigen_place, framework::To32BitIndex(out_t), + framework::To32BitIndex(in_t), offsets_32bit, extents_32bit); + } else { + EigenSlice, T, D>::Eval( + eigen_place, out_t, in_t, offsets, extents); + } + + out->Resize(out_dims); + out->mutable_data(ctx.GetPlace()); +} + +template +void Tensor_narrow(const framework::ExecutionContext& ctx, + const framework::Tensor* src, framework::Tensor* out, + int row_s, int row_e, int col_s, int col_e) { + auto rank = src->dims().size(); + std::vector axes_int = {rank - 2, rank - 1}; + std::vector starts_int = {row_s, col_s}; + std::vector ends_int = {row_e, col_e}; + switch (rank) { + case 1: + SliceCompute(ctx, src, out, axes_int, starts_int, + ends_int); + break; + case 2: + SliceCompute(ctx, src, out, axes_int, starts_int, + ends_int); + break; + case 3: + SliceCompute(ctx, src, out, axes_int, starts_int, + ends_int); + break; + case 4: + SliceCompute(ctx, src, out, axes_int, starts_int, + ends_int); + break; + case 5: + SliceCompute(ctx, src, out, axes_int, starts_int, + ends_int); + break; + case 6: + SliceCompute(ctx, src, out, axes_int, starts_int, + ends_int); + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", rank)); + } +} + +template +void arange(const DeviceContext& dev_ctx, framework::Tensor* tmp, int w, + int batchsize = 1, int h = 1) { + tmp->Resize(framework::make_ddim({batchsize * w})); + 
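+ // Fills tmp on the CPU with one ramp per batch: tmp[b * w + i] = b * h + i.
+ // LU_Unpack and Unpack_Pivot later use these values as row indices.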
platform::CPUPlace cpu; + auto tmpdata = tmp->mutable_data(cpu); + for (int b = 0; b < batchsize; b++) { + for (int i = 0; i < w; i++) { + tmpdata[b * w + i] = static_cast(b * h + i); + } + } +} + +template +struct OneFunctor { + OneFunctor(T* output, int* idtptr, int w, int dim) + : output_(output), idtptr_(idtptr), w_(w), dim_(dim) {} + + HOSTDEVICE void operator()(size_t idx) const { + output_[w_ * idtptr_[idx] + idx % dim_] = static_cast(1); + } + + T* output_; + int* idtptr_; + int w_; + int dim_; +}; + +template +void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, + framework::Tensor* L, framework::Tensor* U) { + const auto udims = LU->dims(); + L->Resize(udims); + U->Resize(udims); + const auto H = udims[udims.size() - 2]; + const auto W = udims[udims.size() - 1]; + auto L_dataptr = L->mutable_data(dev_ctx.GetPlace()); + platform::ForRange x_for_range(dev_ctx, LU->numel()); + TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, L_dataptr); + x_for_range(tril_computer); + + TrilTriuCompute triu_computer(LU->data(), 0, false, H, W, + U->mutable_data(dev_ctx.GetPlace())); + x_for_range(triu_computer); + + // set L's diagonal 1 + auto dim = std::min(H, W); + framework::Tensor rowtensor, rt_dev; + auto batchsize = product(framework::slice_ddim(udims, 0, udims.size() - 2)); + batchsize = std::max(static_cast(batchsize), 1); + arange(dev_ctx, &rowtensor, dim, batchsize, H); + auto idtptr = rowtensor.data(); + if (is_gpu_place(dev_ctx.GetPlace())) { + framework::TensorCopy(rowtensor, dev_ctx.GetPlace(), &rt_dev); + idtptr = rt_dev.data(); + } + + platform::ForRange for_range(dev_ctx, rowtensor.numel()); + OneFunctor functor(L_dataptr, idtptr, W, dim); + for_range(functor); +} + +template +void scatterpivot(const DeviceContext& dev_ctx, T* out_data, + framework::Tensor* idlst, int w, int dim) { + framework::Tensor idlst_tmp; + idlst_tmp.Resize(idlst->dims()); + idlst_tmp.mutable_data(dev_ctx.GetPlace()); + framework::TensorCopy(*idlst, dev_ctx.GetPlace(), &idlst_tmp); + auto idtptr = idlst_tmp.data(); + + platform::ForRange for_range(dev_ctx, idlst_tmp.numel()); + OneFunctor functor(out_data, idtptr, w, dim); + for_range(functor); +} + +template +void Unpack_Pivot(const DeviceContext& dev_ctx, const framework::Tensor& Pivot, + framework::Tensor* P, int h, int w) { + auto dims = Pivot.dims(); + auto Pdimvec = vectorize(dims); + auto prank = Pdimvec.size(); + auto Pnum = dims[prank - 1]; + framework::Tensor Pivot_cpu; + platform::CPUPlace cpu; + framework::TensorCopy(Pivot, cpu, &Pivot_cpu); + auto pdataptr = Pivot_cpu.data(); + Pdimvec[prank - 1] = h; + Pdimvec.emplace_back(h); + auto Pdim = framework::make_ddim(Pdimvec); + P->Resize(Pdim); + auto pdata = P->mutable_data(dev_ctx.GetPlace()); + math::SetConstant setter; + setter(dev_ctx, P, static_cast(0)); + + auto batchsize = product(framework::slice_ddim(dims, 0, prank - 1)); + batchsize = std::max(static_cast(batchsize), 1); + framework::Tensor idt; + for (int i = 0; i < batchsize; i++) { + arange(dev_ctx, &idt, h); + auto idlst = idt.data(); + for (int j = 0; j < Pnum; j++) { + if (idlst[pdataptr[i * Pnum + j] - 1] == idlst[j]) continue; + auto temp = idlst[j]; + idlst[j] = idlst[pdataptr[i * Pnum + j] - 1]; + idlst[pdataptr[i * Pnum + j] - 1] = temp; + } + scatterpivot(dev_ctx, &(pdata[i * h * h]), &idt, h, h); + } +} + +template +class LUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto xin = ctx.Input("X"); + auto out = 
ctx.Input("Out"); + auto P = ctx.Input("Pivots"); + auto dout = ctx.Input(framework::GradVarName("Out")); + auto dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + const auto& dev_ctx = ctx.template device_context(); + math::DeviceIndependenceTensorOperations helper(ctx); + auto blas = math::GetBlas(ctx); + + auto xdims = xin->dims(); + int xrank = xdims.size(); + int64_t m = xdims[xrank - 2]; + int64_t n = xdims[xrank - 1]; + int64_t k = std::min(m, n); + + framework::Tensor L, U, L_narrow, U_narrow, L_narrow_mH, U_narrow_mH, + grad_narrow; + LU_Unpack(dev_ctx, out, &L, &U); + + Tensor_narrow(ctx, &L, &L_narrow, 0, k, 0, k); + Tensor_narrow(ctx, &U, &U_narrow, 0, k, 0, k); + Tensor_narrow(ctx, dout, &grad_narrow, 0, k, 0, k); + auto graddims = grad_narrow.dims(); + + Tensor_Conj(dev_ctx, L_narrow, &L_narrow_mH); + Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); + L_narrow_mH = helper.Transpose(L_narrow_mH); + U_narrow_mH = helper.Transpose(U_narrow_mH); + + auto LmHdims = L_narrow_mH.dims(); + auto UmHdims = U_narrow_mH.dims(); + + framework::Tensor phi_L, phi_U, phi, psi; + phi_L.Resize(LmHdims); + phi_L.mutable_data(ctx.GetPlace()); + phi_U.Resize(UmHdims); + phi_U.mutable_data(ctx.GetPlace()); + auto mat_dim_l = math::CreateMatrixDescriptor(LmHdims, 0, false); + auto mat_dim_u = math::CreateMatrixDescriptor(UmHdims, 0, false); + auto mat_dim_g = math::CreateMatrixDescriptor(graddims, 0, false); + blas.MatMul(L_narrow_mH, mat_dim_l, grad_narrow, mat_dim_g, + static_cast(1), &phi_L, static_cast(0)); + + blas.MatMul(grad_narrow, mat_dim_g, U_narrow_mH, mat_dim_u, + static_cast(1), &phi_U, static_cast(0)); + + auto phil_rank = LmHdims.size(); + auto phiu_rank = UmHdims.size(); + platform::ForRange l_for_range(dev_ctx, phi_L.numel()); + TrilTriuCompute tril_computer(phi_L.data(), -1, true, + LmHdims[phil_rank - 2], + LmHdims[phil_rank - 1], phi_L.data()); + l_for_range(tril_computer); + + platform::ForRange u_for_range(dev_ctx, phi_U.numel()); + TrilTriuCompute triu_computer(phi_U.data(), 0, false, + UmHdims[phiu_rank - 2], + UmHdims[phiu_rank - 1], phi_U.data()); + u_for_range(triu_computer); + + Tensor_Add(dev_ctx, phi_L, phi_U, &phi); + psi.Resize(xdims); + psi.mutable_data(ctx.GetPlace()); + math::SetConstant setter; + setter(dev_ctx, &psi, static_cast(0)); + + std::vector axes = {xrank - 2, xrank - 1}; + std::vector slice_starts(2, 0); + std::vector slice_ends(2, 0); + auto valuedims = vectorize(xdims); + + framework::Tensor Pmat; + Unpack_Pivot(dev_ctx, *P, &Pmat, m, k); + if (m <= n) { + if (k < n) { + framework::Tensor U_complement, U_grad_complement, phi_complement, + phi_complement_l; + Tensor_narrow(ctx, &U, &U_complement, 0, k, k, n); + Tensor_narrow(ctx, dout, &U_grad_complement, 0, k, k, + n); + framework::Tensor U_complement_mH = helper.Transpose(U_complement); + + Tensor_Conj(dev_ctx, U_complement_mH, + &U_complement_mH); + + auto mat_dim_g = + math::CreateMatrixDescriptor(U_grad_complement.dims(), 0, false); + auto mat_dim_u = + math::CreateMatrixDescriptor(U_complement_mH.dims(), 0, false); + auto phidims = UmHdims; + phidims[UmHdims.size() - 2] = k; + phidims[UmHdims.size() - 1] = k; + phi_complement.Resize(phidims); + phi_complement.mutable_data(ctx.GetPlace()); + blas.MatMul(U_grad_complement, mat_dim_g, U_complement_mH, mat_dim_u, + static_cast(1), &phi_complement, static_cast(0)); + + phi_complement_l.Resize(phidims); + phi_complement_l.mutable_data(ctx.GetPlace()); + const auto H = phidims[phidims.size() - 2]; + const auto W = 
phidims[phidims.size() - 1]; + platform::ForRange x_for_range(dev_ctx, + phi_complement.numel()); + TrilTriuCompute tril_computer(phi_complement.data(), -1, true, H, + W, phi_complement_l.data()); + x_for_range(tril_computer); + + Tensor_Sub(dev_ctx, phi, phi_complement_l, &phi); + + slice_starts[0] = 0; + slice_starts[1] = k; + slice_ends[0] = k; + slice_ends[1] = n; + valuedims[xrank - 2] = k; + valuedims[xrank - 1] = n - k; + SetValueCompute_dispatch( + ctx, &psi, &U_grad_complement, &psi, axes, &slice_starts, + &slice_ends, valuedims, xrank); + } + + framework::Tensor psi_principal, phi_mH, psi_tmp; + Tensor_Conj(dev_ctx, phi, &phi_mH); + phi_mH = helper.Transpose(phi_mH); + triangular_solve(dev_ctx, U_narrow, phi_mH, + &psi_principal, true, false, false); + + Tensor_Conj(dev_ctx, psi_principal, &psi_principal); + psi_principal = helper.Transpose(psi_principal); + slice_starts[0] = 0; + slice_starts[1] = 0; + slice_ends[0] = k; + slice_ends[1] = k; + valuedims[xrank - 2] = k; + valuedims[xrank - 1] = k; + + SetValueCompute_dispatch(ctx, &psi, &psi_principal, + &psi, axes, &slice_starts, + &slice_ends, valuedims, xrank); + triangular_solve(dev_ctx, L_narrow_mH, psi, &psi_tmp, + true, false, true); + + auto mat_dim_p = math::CreateMatrixDescriptor(Pmat.dims(), 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(psi_tmp.dims(), 0, false); + blas.MatMul(Pmat, mat_dim_p, psi_tmp, mat_dim_b, static_cast(1), dx, + static_cast(0)); + } else { + framework::Tensor L_complement, L_grad_complement, phi_complement, + phi_complement_u; + Tensor_narrow(ctx, &L, &L_complement, k, m, 0, k); + Tensor_narrow(ctx, dout, &L_grad_complement, k, m, 0, + k); + framework::Tensor L_complement_mH = helper.Transpose(L_complement); + Tensor_Conj(dev_ctx, L_complement_mH, &L_complement_mH); + + auto mat_dim_g = + math::CreateMatrixDescriptor(L_grad_complement.dims(), 0, false); + auto mat_dim_u = + math::CreateMatrixDescriptor(L_complement_mH.dims(), 0, false); + auto phidims = LmHdims; + phidims[LmHdims.size() - 2] = k; + phidims[LmHdims.size() - 1] = k; + phi_complement.Resize(phidims); + phi_complement.mutable_data(ctx.GetPlace()); + blas.MatMul(L_complement_mH, mat_dim_u, L_grad_complement, mat_dim_g, + static_cast(1), &phi_complement, static_cast(0)); + + phi_complement_u.Resize(phidims); + phi_complement_u.mutable_data(ctx.GetPlace()); + const auto H = phidims[phidims.size() - 2]; + const auto W = phidims[phidims.size() - 1]; + platform::ForRange x_for_range(dev_ctx, + phi_complement.numel()); + TrilTriuCompute triu_computer(phi_complement.data(), 0, false, H, W, + phi_complement_u.data()); + x_for_range(triu_computer); + + Tensor_Sub(dev_ctx, phi, phi_complement_u, &phi); + + slice_starts[0] = k; + slice_starts[1] = 0; + slice_ends[0] = m; + slice_ends[1] = k; + valuedims[xrank - 2] = m - k; + valuedims[xrank - 1] = k; + SetValueCompute_dispatch(ctx, &psi, &L_grad_complement, + &psi, axes, &slice_starts, + &slice_ends, valuedims, xrank); + framework::Tensor psi_principal, phi_mH, psi_tmp, U_narrow_mH; + triangular_solve(dev_ctx, L_narrow_mH, phi, + &psi_principal, true, false, true); + slice_starts[0] = 0; + slice_starts[1] = 0; + slice_ends[0] = k; + slice_ends[1] = k; + valuedims[xrank - 2] = k; + valuedims[xrank - 1] = k; + + SetValueCompute_dispatch(ctx, &psi, &psi_principal, + &psi, axes, &slice_starts, + &slice_ends, valuedims, xrank); + + psi_tmp.Resize(psi.dims()); + psi_tmp.mutable_data(ctx.GetPlace()); + auto mat_dim_p = math::CreateMatrixDescriptor(Pmat.dims(), 0, false); + auto mat_dim_b = 
math::CreateMatrixDescriptor(psi.dims(), 0, false); + blas.MatMul(Pmat, mat_dim_p, psi, mat_dim_b, static_cast(1), &psi_tmp, + static_cast(0)); + psi_tmp = helper.Transpose(psi_tmp); + + Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); + triangular_solve(dev_ctx, U_narrow_mH, psi_tmp, &psi, + true, false, false); + *dx = helper.Transpose(psi); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lu_unpack_op.cc b/paddle/fluid/operators/lu_unpack_op.cc new file mode 100644 index 0000000000000..e38a4703f64ee --- /dev/null +++ b/paddle/fluid/operators/lu_unpack_op.cc @@ -0,0 +1,184 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lu_unpack_op.h" + +namespace paddle { +namespace operators { + +class LU_UnpackOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddComment(R"DOC(Unpack L U and P to single matrix tensor, + unpack L and U matrix from LU, unpack permutation matrix Pmat from Pivtos . + )DOC"); + AddInput("X", "(Tensor) The input LU tensor, shape of (*,m,n)"); + AddInput("Pivots", + "(Tensor) The input Pivots tensor, shape of (*,min(m,n))"); + AddOutput( + "Pmat", + "(Tensor) The output permutation matrix tensor, shape of (*, m, m)"); + AddOutput("L", "(Tensor) The output lower triangular matrix tensor"); + AddOutput("U", "(Tensor) The output upper triangular matrix tensor"); + AddAttr("unpack_ludata", "Whether to unpack L and U") + .SetDefault(true); + AddAttr("unpack_pivots", "Whether to unpack permutation matrix") + .SetDefault(true); + } +}; + +class LU_UnpackOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "LU_Unpack"); + OP_INOUT_CHECK(context->HasInput("Pivots"), "Input", "Pivots", "LU_Unpack"); + OP_INOUT_CHECK(context->HasOutput("L"), "Output", "L", "LU_Unpack"); + OP_INOUT_CHECK(context->HasOutput("U"), "Output", "U", "LU_Unpack"); + OP_INOUT_CHECK(context->HasOutput("Pmat"), "Output", "Pmat", "LU_Unpack"); + bool unpack_ludata = context->Attrs().Get("unpack_ludata"); + bool unpack_pivots = context->Attrs().Get("unpack_pivots"); + + auto x_dims = context->GetInputDim("X"); + int x_rank = x_dims.size(); + PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( + "the rank of input must greater than 2")); + + // context->SetOutputDim("Out", x_dims); + int m = x_dims[x_rank - 1]; + int n = x_dims[x_rank - 2]; + int min_mn = std::min(m, n); + if (unpack_ludata) { + auto ldims = x_dims; + auto udims = x_dims; + if (m >= n) { + udims[x_rank - 2] = min_mn; + } else { + ldims[x_rank - 1] = min_mn; + } + context->SetOutputDim("U", udims); + context->SetOutputDim("L", ldims); + } + if (unpack_pivots) { + auto pdims = x_dims; + pdims[x_rank - 1] = m; + context->SetOutputDim("Pmat", pdims); + } + } + + protected: + 
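+ // Kernel selection for lu_unpack depends only on X's data type and the
+ // current place.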
framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class LU_UnpackOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto var_type = ctx->GetInputType("X", 0); + auto data_type = ctx->GetInputDataType("X", 0); + + ctx->SetOutputType("L", var_type, framework::ALL_ELEMENTS); + ctx->SetOutputDataType("L", data_type, framework::ALL_ELEMENTS); + + ctx->SetOutputType("U", var_type, framework::ALL_ELEMENTS); + ctx->SetOutputDataType("U", data_type, framework::ALL_ELEMENTS); + + ctx->SetOutputType("Pmat", var_type, framework::ALL_ELEMENTS); + ctx->SetOutputDataType("Pmat", data_type, framework::ALL_ELEMENTS); + } +}; + +template +class LU_UnpackOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("lu_unpack_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput("Pivots", this->Input("Pivots")); + retv->SetInput("L", this->Output("L")); + retv->SetInput("U", this->Output("U")); + retv->SetInput("Pmat", this->Output("Pmat")); + + retv->SetInput(framework::GradVarName("L"), this->OutputGrad("L")); + retv->SetInput(framework::GradVarName("U"), this->OutputGrad("U")); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class LU_UnpackGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto var_type = ctx->GetInputType("X", 0); + auto data_type = ctx->GetInputDataType("X", 0); + + ctx->SetOutputType(framework::GradVarName("X"), var_type, + framework::ALL_ELEMENTS); + ctx->SetOutputDataType(framework::GradVarName("X"), data_type, + framework::ALL_ELEMENTS); + } +}; + +class LU_UnpackGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lu_unpack"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("L")), "Input", + "L@GRAD", "lu_unpack"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("U")), "Input", + "U@GRAD", "lu_unpack"); + + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(lu_unpack, ops::LU_UnpackOp, ops::LU_UnpackOpMaker, + ops::LU_UnpackOpVarTypeInference, + ops::LU_UnpackOpGradMaker, + ops::LU_UnpackOpGradMaker); +REGISTER_OPERATOR(lu_unpack_grad, ops::LU_UnpackGradOp, + ops::LU_UnpackGradOpVarTypeInference); + +REGISTER_OP_CPU_KERNEL(lu_unpack, + ops::LU_UnpackKernel, + ops::LU_UnpackKernel); +REGISTER_OP_CPU_KERNEL( + lu_unpack_grad, ops::LU_UnpackGradKernel, + ops::LU_UnpackGradKernel); diff --git 
a/paddle/fluid/operators/lu_unpack_op.cu b/paddle/fluid/operators/lu_unpack_op.cu new file mode 100644 index 0000000000000..c3247f38e12c2 --- /dev/null +++ b/paddle/fluid/operators/lu_unpack_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/lu_unpack_op.h" + +namespace paddle { +namespace operators {} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(lu_unpack, + ops::LU_UnpackKernel, + ops::LU_UnpackKernel); +REGISTER_OP_CUDA_KERNEL( + lu_unpack_grad, ops::LU_UnpackGradKernel, + ops::LU_UnpackGradKernel); diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h new file mode 100644 index 0000000000000..115ab116fda1a --- /dev/null +++ b/paddle/fluid/operators/lu_unpack_op.h @@ -0,0 +1,144 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/lu_op.h" +#include "paddle/fluid/operators/tril_triu_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensorArray = framework::LoDTensorArray; + +template +class LU_UnpackKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto xin = ctx.Input("X"); + auto P = ctx.Input("Pivots"); + + auto ltensor = ctx.Output("L"); + auto utensor = ctx.Output("U"); + auto ptensor = ctx.Output("Pmat"); + + auto unpack_ludata = ctx.Attr("unpack_ludata"); + auto unpack_pivots = ctx.Attr("unpack_pivots"); + + const auto& dev_ctx = ctx.template device_context(); + + auto xdims = xin->dims(); + int xrank = xdims.size(); + int64_t m = xdims[xrank - 2]; + int64_t n = xdims[xrank - 1]; + int64_t k = std::min(m, n); + + if (unpack_ludata) { + ltensor->mutable_data(ctx.GetPlace()); + utensor->mutable_data(ctx.GetPlace()); + + framework::Tensor L, U; + LU_Unpack(dev_ctx, xin, &L, &U); + + if (m >= n) { + framework::TensorCopy(L, ctx.GetPlace(), ltensor); + Tensor_narrow(ctx, &U, utensor, 0, k, 0, k); + } else { + framework::TensorCopy(U, ctx.GetPlace(), utensor); + Tensor_narrow(ctx, &L, ltensor, 0, k, 0, k); + } + } + + if (unpack_pivots) { + ptensor->mutable_data(ctx.GetPlace()); + Unpack_Pivot(dev_ctx, *P, ptensor, m, k); + } + } +}; + +template +class LU_UnpackGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto dl = ctx.Input(framework::GradVarName("L")); + auto du = ctx.Input(framework::GradVarName("U")); + auto dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + const auto& dev_ctx = ctx.template device_context(); + + framework::Tensor dl_tril, du_triu; + const auto ldims = dl->dims(); + dl_tril.Resize(ldims); + auto H = ldims[ldims.size() - 2]; + auto W = ldims[ldims.size() - 1]; + auto L_dataptr = dl_tril.mutable_data(dev_ctx.GetPlace()); + platform::ForRange l_for_range(dev_ctx, dl->numel()); + TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, L_dataptr); + l_for_range(tril_computer); + + const auto udims = du->dims(); + du_triu.Resize(udims); + H = udims[udims.size() - 2]; + W = udims[udims.size() - 1]; + auto U_dataptr = du_triu.mutable_data(dev_ctx.GetPlace()); + platform::ForRange u_for_range(dev_ctx, du->numel()); + TrilTriuCompute triu_computer(du->data(), 0, false, H, W, U_dataptr); + u_for_range(triu_computer); + + auto xdims = dx->dims(); + int xrank = xdims.size(); + int64_t m = xdims[xrank - 2]; + int64_t n = xdims[xrank - 1]; + int64_t k = std::min(m, n); + + std::vector axes = {xrank - 2, xrank - 1}; + std::vector slice_starts(2, 0); + std::vector slice_ends(2, 0); + auto valuedims = vectorize(xdims); + + math::SetConstant setter; + setter(dev_ctx, dx, static_cast(0)); + if (m <= n) { + slice_starts[0] = 0; + slice_starts[1] = 0; + slice_ends[0] = k; + slice_ends[1] = k; + valuedims[xrank - 2] = k; + valuedims[xrank - 1] = k; + SetValueCompute_dispatch(ctx, dx, &dl_tril, dx, axes, + &slice_starts, &slice_ends, + valuedims, xrank); + + Tensor_Add(dev_ctx, *dx, du_triu, dx); + } else { + slice_starts[0] = 0; + slice_starts[1] = 0; + slice_ends[0] = k; + slice_ends[1] = k; + valuedims[xrank - 2] = k; + valuedims[xrank - 1] = k; + SetValueCompute_dispatch(ctx, dx, &du_triu, dx, axes, + &slice_starts, &slice_ends, + valuedims, xrank); + + 
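+ // In this branch (m > n) the forward pass kept L at the full (*, m, n)
+ // shape and cropped U to (k, k), so du_triu was scattered into the (k, k)
+ // principal block above and dl_tril is now added elementwise.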
Tensor_Add(dev_ctx, *dx, dl_tril, dx); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index 35035704b7e07..e4fb4150f841b 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -24,6 +24,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/margin_cross_entropy_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/operators/masked_select_op_xpu.cc b/paddle/fluid/operators/masked_select_op_xpu.cc index d86ad8f89b9fd..c575f133b1572 100644 --- a/paddle/fluid/operators/masked_select_op_xpu.cc +++ b/paddle/fluid/operators/masked_select_op_xpu.cc @@ -12,6 +12,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/masked_select_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -41,13 +42,8 @@ class MaskedSelectXPUKernel : public framework::OpKernel { int* out_size = RAII_GUARD.alloc_l3_or_gm(1); int out_size_cpu; - int ret = xpu::nonzero_count(dev_ctx.x_context(), mask_data, out_size, - mask->numel()); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU nonzero_count kernel return wrong value[%d %s]", - ret, XPUAPIErrorMsg[ret])); - + PADDLE_ENFORCE_XPU_SUCCESS(xpu::nonzero_count( + dev_ctx.x_context(), mask_data, out_size, mask->numel())); memory::Copy(platform::CPUPlace(), static_cast(&out_size_cpu), BOOST_GET_CONST(platform::XPUPlace, mask->place()), static_cast(out_size), sizeof(int32_t)); @@ -59,12 +55,9 @@ class MaskedSelectXPUKernel : public framework::OpKernel { auto input_shape = framework::vectorize(input_dim); auto mask_shape = framework::vectorize(mask_dim); - ret = xpu::masked_select(dev_ctx.x_context(), input_data, mask_data, - out_data, input_shape, mask_shape); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU masked_select kernel return wrong value[%d %s]", - ret, XPUAPIErrorMsg[ret])); + PADDLE_ENFORCE_XPU_SUCCESS( + xpu::masked_select(dev_ctx.x_context(), input_data, mask_data, out_data, + input_shape, mask_shape, out_size_cpu)); } }; diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index a2f619d84a21e..fcf988efcd34c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -111,6 +111,15 @@ if(WITH_ROCM) hip_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) + +if(WITH_GPU AND (NOT WITH_ROCM)) +#currenty not yet support ROCM +#the generic conversion APIs of dense and sparse are only supported after cuda11.2 + if((NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2)) + cc_test(cusparse_conversion_api_test SRCS cusparse_conversion_api_test.cc DEPS tensor) + endif() +endif() + cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) if(WITH_TESTING AND TEST im2col_test) set_tests_properties(im2col_test PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/operators/math/cusparse_conversion_api_test.cc 
b/paddle/fluid/operators/math/cusparse_conversion_api_test.cc new file mode 100644 index 0000000000000..d45b57420eef1 --- /dev/null +++ b/paddle/fluid/operators/math/cusparse_conversion_api_test.cc @@ -0,0 +1,180 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/sparse.h" + +template +void TestNNZ(const std::vector& dense_data, const int correct_nnz, + const int rows, const int cols) { + paddle::platform::CUDADeviceContext* context = + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + auto sparse = + paddle::operators::math::GetSparse(*context); + + paddle::framework::Tensor dense, nnz_tensor; + auto dense_dims = paddle::framework::make_ddim({rows, cols}); + auto nnz_dims = paddle::framework::make_ddim({dense_dims[0] + 1}); + dense.mutable_data(dense_dims, paddle::platform::CUDAPlace()); + paddle::framework::TensorFromVector(dense_data, *context, &dense); + int32_t* nnz_ptr = + nnz_tensor.mutable_data(nnz_dims, paddle::platform::CUDAPlace()); + sparse.nnz(rows, cols, dense.data(), nnz_ptr, nnz_ptr + 1); + std::vector nnz_vec(dense_dims[0] + 1); + paddle::framework::TensorToVector(nnz_tensor, *context, &nnz_vec); + delete context; + CHECK_EQ(correct_nnz, nnz_vec[0]); +} + +TEST(sparse, nnz) { + std::vector dense_data = {0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 3.2, 0.0, 0.0}; + TestNNZ(dense_data, 4, 3, 3); +} + +TEST(sparse, nnz_double) { + std::vector dense_data = {0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 3.2, 0.0}; + TestNNZ(dense_data, 4, 4, 2); +} + +template +void TestDenseToSparse(const std::vector& correct_dense_data, + const std::vector& correct_rows, + const std::vector& correct_cols, + const std::vector& correct_values, + const int correct_nnz, const int rows, const int cols, + const std::string& mode) { + paddle::platform::CUDADeviceContext* context = + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); + // get sparse + auto sparse = + paddle::operators::math::GetSparse(*context); + + // create tensor and copy vector to tensor + paddle::framework::Tensor dense_tensor, rows_tensor, cols_tensor, + values_tensor, actual_dense_tensor; + auto dense_dims = paddle::framework::make_ddim({rows, cols}); + T* dense_data = + dense_tensor.mutable_data(dense_dims, paddle::platform::CUDAPlace()); + T* actual_dense_data = actual_dense_tensor.mutable_data( + dense_dims, paddle::platform::CUDAPlace()); + paddle::framework::TensorFromVector(correct_dense_data, *context, + &dense_tensor); + + auto nnz_dims = paddle::framework::make_ddim({correct_nnz}); + auto crows_dims = paddle::framework::make_ddim({rows + 1}); + int64_t* rows_data = nullptr; + if (mode == "COO") { + rows_data = rows_tensor.mutable_data( + nnz_dims, paddle::platform::CUDAPlace()); + } else { + rows_data = rows_tensor.mutable_data( + crows_dims, paddle::platform::CUDAPlace()); + } + int64_t* cols_data = 
cols_tensor.mutable_data( + nnz_dims, paddle::platform::CUDAPlace()); + T* values_data = + values_tensor.mutable_data(nnz_dims, paddle::platform::CUDAPlace()); + + // test dense_to_sparse + if (mode == "COO") { + sparse.DenseToSparseCoo(rows, cols, dense_data, rows_data, cols_data, + values_data); + } else { + sparse.DenseToSparseCsr(rows, cols, dense_data, rows_data, cols_data, + values_data); + } + + std::vector actual_rows(correct_nnz), actual_crows(rows + 1), + actual_cols(correct_nnz); + std::vector actual_values(correct_nnz), actual_dense_vec(rows * cols); + if (mode == "COO") { + paddle::framework::TensorToVector(rows_tensor, *context, + &actual_rows); + } else { + paddle::framework::TensorToVector(rows_tensor, *context, + &actual_crows); + } + paddle::framework::TensorToVector(cols_tensor, *context, + &actual_cols); + paddle::framework::TensorToVector(values_tensor, *context, &actual_values); + + for (int i = 0; i < correct_nnz; i++) { + if (mode == "COO") { + CHECK_EQ(correct_rows[i], actual_rows[i]); + } + CHECK_EQ(correct_cols[i], actual_cols[i]); + CHECK_EQ(correct_values[i], actual_values[i]); + } + if (mode == "CSR") { + for (int i = 0; i < rows + 1; i++) { + CHECK_EQ(correct_rows[i], actual_crows[i]); + } + } + + // test sparse_to_dense + if (mode == "COO") { + sparse.SparseCooToDense(rows, cols, correct_nnz, rows_data, cols_data, + values_data, actual_dense_data); + } else { + sparse.SparseCsrToDense(rows, cols, correct_nnz, rows_data, cols_data, + values_data, actual_dense_data); + } + paddle::framework::TensorToVector(actual_dense_tensor, *context, + &actual_dense_vec); + for (uint64_t i = 0; i < correct_dense_data.size(); i++) { + CHECK_EQ(correct_dense_data[i], actual_dense_vec[i]); + } + + delete context; +} + +TEST(sparse, dense_to_sparse) { + std::vector dense_data = {0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 3.2, 0.0, 0.0}; + std::vector values = {1.0, 2.0, 3.0, 3.2}; + std::vector rows = {0, 1, 1, 2}; + std::vector crows = {0, 1, 3, 4}; + std::vector cols = {1, 0, 2, 0}; + TestDenseToSparse(dense_data, rows, cols, values, 4, 3, 3, "COO"); + TestDenseToSparse(dense_data, crows, cols, values, 4, 3, 3, "CSR"); +} + +TEST(sparse, dense_to_sparse_double) { + std::vector dense_data = {0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 3.2, 0.0}; + std::vector values = {1.0, 2.0, 3.0, 3.2}; + std::vector rows = {0, 1, 2, 3}; + std::vector crows = {0, 1, 2, 3, 4}; + std::vector cols = {1, 1, 1, 0}; + TestDenseToSparse(dense_data, rows, cols, values, 4, 4, 2, "COO"); + TestDenseToSparse(dense_data, crows, cols, values, 4, 4, 2, "CSR"); +} + +TEST(sparse, dense_to_sparse_fp16) { + using float16 = paddle::platform::float16; + std::vector dense_data = {float16(0.0), float16(1.0), float16(0.0), + float16(2.0), float16(0.0), float16(3.0), + float16(3.2), float16(0.0)}; + std::vector values = {float16(1.0), float16(2.0), float16(3.0), + float16(3.2)}; + std::vector rows = {0, 1, 2, 3}; + std::vector crows = {0, 1, 2, 3, 4}; + std::vector cols = {1, 1, 1, 0}; + TestDenseToSparse(dense_data, rows, cols, values, 4, 4, 2, "COO"); + TestDenseToSparse(dense_data, crows, cols, values, 4, 4, 2, "CSR"); +} diff --git a/paddle/fluid/operators/math/lapack_function.cc b/paddle/fluid/operators/math/lapack_function.cc index 3ce2225420e60..33fa2efb12c1b 100644 --- a/paddle/fluid/operators/math/lapack_function.cc +++ b/paddle/fluid/operators/math/lapack_function.cc @@ -125,6 +125,102 @@ void lapackEig, float>( reinterpret_cast *>(work), &lwork, rwork, info); } +template <> +void lapackGels(char trans, int m, int n, int nrhs, 
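// ---------------------------------------------------------------------------
// Editor's aside: an illustrative CPU-only reference (an assumption for
// clarity, not the cuSPARSE path exercised by the test above) of the quantities
// being checked: nnz, CSR row offsets (crows), column indices, and values of a
// row-major dense matrix.
#include <cstdint>
#include <vector>

void DenseToCsrReference(const std::vector<float>& dense, int rows, int cols,
                         std::vector<int64_t>* crows,
                         std::vector<int64_t>* col_idx,
                         std::vector<float>* values) {
  crows->assign(rows + 1, 0);
  col_idx->clear();
  values->clear();
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      const float v = dense[r * cols + c];
      if (v != 0.0f) {  // keep nonzeros only
        col_idx->push_back(c);
        values->push_back(v);
      }
    }
    (*crows)[r + 1] = static_cast<int64_t>(values->size());  // prefix sums
  }
  // nnz == crows->back(); for the 3x3 sample above this yields crows = {0,1,3,4}.
}
// ---------------------------------------------------------------------------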
double *a, int lda, + double *b, int ldb, double *work, int lwork, + int *info) { + platform::dynload::dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, + &lwork, info); +} + +template <> +void lapackGels(char trans, int m, int n, int nrhs, float *a, int lda, + float *b, int ldb, float *work, int lwork, int *info) { + platform::dynload::sgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, + &lwork, info); +} + +template <> +void lapackGelsd(int m, int n, int nrhs, double *a, int lda, double *b, + int ldb, double *s, double rcond, int *rank, + double *work, int lwork, double *rwork, int *iwork, + int *info) { + platform::dynload::dgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank, + work, &lwork, iwork, info); +} + +template <> +void lapackGelsd(int m, int n, int nrhs, float *a, int lda, float *b, + int ldb, float *s, float rcond, int *rank, float *work, + int lwork, float *rwork, int *iwork, int *info) { + platform::dynload::sgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank, + work, &lwork, iwork, info); +} + +template <> +void lapackGelsy(int m, int n, int nrhs, double *a, int lda, double *b, + int ldb, int *jpvt, double rcond, int *rank, + double *work, int lwork, double *rwork, int *info) { + platform::dynload::dgelsy_(&m, &n, &nrhs, a, &lda, b, &ldb, jpvt, &rcond, + rank, work, &lwork, info); +} + +template <> +void lapackGelsy(int m, int n, int nrhs, float *a, int lda, float *b, + int ldb, int *jpvt, float rcond, int *rank, float *work, + int lwork, float *rwork, int *info) { + platform::dynload::sgelsy_(&m, &n, &nrhs, a, &lda, b, &ldb, jpvt, &rcond, + rank, work, &lwork, info); +} + +template <> +void lapackGelss(int m, int n, int nrhs, double *a, int lda, double *b, + int ldb, double *s, double rcond, int *rank, + double *work, int lwork, double *rwork, int *info) { + platform::dynload::dgelss_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank, + work, &lwork, info); +} + +template <> +void lapackGelss(int m, int n, int nrhs, float *a, int lda, float *b, + int ldb, float *s, float rcond, int *rank, float *work, + int lwork, float *rwork, int *info) { + platform::dynload::sgelss_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank, + work, &lwork, info); +} + +template <> +void lapackCholeskySolve>( + char uplo, int n, int nrhs, platform::complex *a, int lda, + platform::complex *b, int ldb, int *info) { + platform::dynload::zpotrs_( + &uplo, &n, &nrhs, reinterpret_cast *>(a), &lda, + reinterpret_cast *>(b), &ldb, info); +} + +template <> +void lapackCholeskySolve>(char uplo, int n, int nrhs, + platform::complex *a, + int lda, + platform::complex *b, + int ldb, int *info) { + platform::dynload::cpotrs_( + &uplo, &n, &nrhs, reinterpret_cast *>(a), &lda, + reinterpret_cast *>(b), &ldb, info); +} + +template <> +void lapackCholeskySolve(char uplo, int n, int nrhs, double *a, int lda, + double *b, int ldb, int *info) { + platform::dynload::dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info); +} + +template <> +void lapackCholeskySolve(char uplo, int n, int nrhs, float *a, int lda, + float *b, int ldb, int *info) { + platform::dynload::spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info); +} + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/lapack_function.h b/paddle/fluid/operators/math/lapack_function.h index a4c2c865c859a..488b225ef570e 100644 --- a/paddle/fluid/operators/math/lapack_function.h +++ b/paddle/fluid/operators/math/lapack_function.h @@ -20,17 +20,46 @@ namespace math { // LU (for example) template -void 
lapackLu(int m, int n, T* a, int lda, int* ipiv, int* info); +void lapackLu(int m, int n, T *a, int lda, int *ipiv, int *info); +// Eigh template -void lapackEigh(char jobz, char uplo, int n, T* a, int lda, ValueType* w, - T* work, int lwork, ValueType* rwork, int lrwork, int* iwork, - int liwork, int* info); +void lapackEigh(char jobz, char uplo, int n, T *a, int lda, ValueType *w, + T *work, int lwork, ValueType *rwork, int lrwork, int *iwork, + int liwork, int *info); +// Eig template -void lapackEig(char jobvl, char jobvr, int n, T1* a, int lda, T1* w, T1* vl, - int ldvl, T1* vr, int ldvr, T1* work, int lwork, T2* rwork, - int* info); +void lapackEig(char jobvl, char jobvr, int n, T1 *a, int lda, T1 *w, T1 *vl, + int ldvl, T1 *vr, int ldvr, T1 *work, int lwork, T2 *rwork, + int *info); + +// Gels +template +void lapackGels(char trans, int m, int n, int nrhs, T *a, int lda, T *b, + int ldb, T *work, int lwork, int *info); + +// Gelsd +template +void lapackGelsd(int m, int n, int nrhs, T1 *a, int lda, T1 *b, int ldb, T2 *s, + T2 rcond, int *rank, T1 *work, int lwork, T2 *rwork, + int *iwork, int *info); + +// Gelsy +template +void lapackGelsy(int m, int n, int nrhs, T1 *a, int lda, T1 *b, int ldb, + int *jpvt, T2 rcond, int *rank, T1 *work, int lwork, T2 *rwork, + int *info); + +// Gelss +template +void lapackGelss(int m, int n, int nrhs, T1 *a, int lda, T1 *b, int ldb, T2 *s, + T2 rcond, int *rank, T1 *work, int lwork, T2 *rwork, + int *info); + +template +void lapackCholeskySolve(char uplo, int n, int nrhs, T *a, int lda, T *b, + int ldb, int *info); } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc index 5deedf084c697..7d03f9590357e 100644 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ b/paddle/fluid/operators/math/matrix_inverse.cu.cc @@ -48,7 +48,7 @@ class MatrixInverseFunctor { memory::Copy(boost::get(context.GetPlace()), tmp_gpu_mat_data->ptr(), boost::get(context.GetPlace()), - a.data(), a.numel() * sizeof(T), context.stream()); + a.data(), a.numel() * sizeof(T), context.stream()); gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); } diff --git a/paddle/fluid/operators/math/sparse.h b/paddle/fluid/operators/math/sparse.h new file mode 100644 index 0000000000000..4ac68a3bdc4c6 --- /dev/null +++ b/paddle/fluid/operators/math/sparse.h @@ -0,0 +1,114 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
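// ---------------------------------------------------------------------------
// Editor's aside: an illustrative direct call to LAPACK dgels_ (assuming a
// Fortran LAPACK with the usual trailing-underscore symbols is linked), showing
// the column-major convention and the lwork = -1 workspace query that the
// lapackGels wrappers above follow via dynload.
#include <vector>

extern "C" void dgels_(char* trans, int* m, int* n, int* nrhs, double* a,
                       int* lda, double* b, int* ldb, double* work, int* lwork,
                       int* info);

// Solves min ||A*x - b||_2 for a column-major m x n matrix A with m >= n.
// A is overwritten by its QR factors; the solution ends up in b[0..n).
int LeastSquaresSketch(std::vector<double>* a, std::vector<double>* b,
                       int m, int n) {
  char trans = 'N';
  int nrhs = 1, lda = m, ldb = m, lwork = -1, info = 0;
  double wkopt = 0.0;
  dgels_(&trans, &m, &n, &nrhs, a->data(), &lda, b->data(), &ldb, &wkopt,
         &lwork, &info);  // workspace query
  lwork = static_cast<int>(wkopt);
  std::vector<double> work(lwork);
  dgels_(&trans, &m, &n, &nrhs, a->data(), &lda, b->data(), &ldb, work.data(),
         &lwork, &info);  // actual solve
  return info;            // 0 on success
}
// ---------------------------------------------------------------------------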
+ +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +class ExecutionContext; +class Tensor; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +namespace math { + +template +class Sparse { + public: + explicit Sparse(const DeviceContext& context) : context_(context) {} + + template + void nnz(const int M, const int N, const T* dense, int* nnz, + int* nnzPerRowColumn) const; + + template + void DenseToSparseCoo(const int M, const int N, const T* dense, int64_t* rows, + int64_t* cols, T* values) const; + + template + void DenseToSparseCsr(const int M, const int N, const T* dense, + int64_t* crows, int64_t* cols, T* values) const; + + template + void SparseCooToDense(const int64_t M, const int64_t N, const int64_t nnz, + const int64_t* rows, const int64_t* cols, + const T* values, T* dense) const; + template + void SparseCsrToDense(const int64_t M, const int64_t N, const int64_t nnz, + const int64_t* crows, const int64_t* cols, + const T* values, T* dense) const; + + private: + const DeviceContext& context_; +}; + +template +class SparseT : private Sparse { + public: + using Sparse::Sparse; + + template + void nnz(ARGS... args) const { + Base()->template nnz(args...); + } + + template + void DenseToSparseCoo(ARGS... args) const { + Base()->template DenseToSparseCoo(args...); + } + template + void DenseToSparseCsr(ARGS... args) const { + Base()->template DenseToSparseCsr(args...); + } + template + void SparseCooToDense(ARGS... args) const { + Base()->template SparseCooToDense(args...); + } + template + void SparseCsrToDense(ARGS... args) const { + Base()->template SparseCsrToDense(args...); + } + + private: + const Sparse* Base() const { + return static_cast*>(this); + } +}; + +template +inline SparseT GetSparse( + const framework::ExecutionContext& exe_ctx) { + return SparseT( + exe_ctx.template device_context()); +} + +template +inline SparseT GetSparse(const DeviceContext& dev_ctx) { + return SparseT(dev_ctx); +} + +} // namespace math +} // namespace operators +} // namespace paddle + +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 11020 +#include "paddle/fluid/operators/math/sparse_impl.cu.h" +#endif +#endif diff --git a/paddle/fluid/operators/math/sparse_impl.cu.h b/paddle/fluid/operators/math/sparse_impl.cu.h new file mode 100644 index 0000000000000..8ff2f4b27df43 --- /dev/null +++ b/paddle/fluid/operators/math/sparse_impl.cu.h @@ -0,0 +1,231 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
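// ---------------------------------------------------------------------------
// Editor's aside: a minimal sketch (hypothetical names) of the dispatch pattern
// used by Sparse/SparseT above: a class template whose member templates receive
// explicit specializations per device and element type in the *.cu.h file.
#include <cstdio>

template <typename Device>
class SparseLike {
 public:
  template <typename T>
  void nnz(const T* dense, int len) const {  // generic fallback: no-op
    (void)dense;
    (void)len;
  }
};

struct GpuTag {};  // stand-in for a device context type

// Explicit specialization of the member template for <GpuTag, float>.
template <>
template <>
void SparseLike<GpuTag>::nnz(const float* dense, int len) const {
  int count = 0;
  for (int i = 0; i < len; ++i) count += (dense[i] != 0.0f);
  std::printf("nnz = %d\n", count);
}
// Usage: SparseLike<GpuTag>().nnz(ptr, len) picks the float specialization.
// ---------------------------------------------------------------------------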
+ +#pragma once + +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/cusparse.h" + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" + +namespace paddle { +namespace operators { +namespace math { + +template +cudaDataType_t GetGpuDataType() { + if (std::is_same::value) { + return CUDA_R_32F; + } else if (std::is_same::value) { + return CUDA_R_64F; + } else if (std::is_same::value) { + return CUDA_R_16F; + } +} + +template <> +template +void Sparse::nnz(const int M, const int N, + const T* dense, int* nnz, + int* nnzPerRowColumn) const {} + +template <> +template <> +void Sparse::nnz(const int M, const int N, + const float* dense, int* nnz, + int* nnzPerRowColumn) const { + cusparseMatDescr_t descr = 0; + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cusparseCreateMatDescr(&descr)); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cusparseSetMatType( + descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cusparseSetMatIndexBase( + descr, CUSPARSE_INDEX_BASE_ZERO)); + + context_.CusparseCall([&](cusparseHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cusparseSnnz( + handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dense, M, nnzPerRowColumn, + nnz)); + }); +} + +template <> +template <> +void Sparse::nnz(const int M, const int N, + const double* dense, int* nnz, + int* nnzPerRowColumn) const { + cusparseMatDescr_t descr = 0; + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cusparseCreateMatDescr(&descr)); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cusparseSetMatType( + descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cusparseSetMatIndexBase( + descr, CUSPARSE_INDEX_BASE_ZERO)); + + context_.CusparseCall([&](cusparseHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cusparseDnnz( + handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dense, M, nnzPerRowColumn, + nnz)); + }); +} + +template +inline void DenseToSparse(const platform::CUDADeviceContext& context, + const int M, const int N, const T* dense, + int64_t* rows, int64_t* cols, T* values, + const cusparseFormat_t format) { + cusparseSpMatDescr_t matB; + cusparseDnMatDescr_t matA; + + cudaDataType_t dtype = GetGpuDataType(); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseCreateDnMat( + &matA, M, N, N, const_cast(reinterpret_cast(dense)), + dtype, CUSPARSE_ORDER_ROW)); + + if (format == CUSPARSE_FORMAT_COO) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseCreateCoo( + &matB, M, N, 0, nullptr, nullptr, nullptr, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, dtype)); + } else if (format == CUSPARSE_FORMAT_CSR) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseCreateCsr( + &matB, M, N, 0, rows, nullptr, nullptr, CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, dtype)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "the sparse format [%s] is not supported", format)); + } + + size_t buffer_size = 0; + context.CusparseCall([&](cusparseHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cusparseDenseToSparse_bufferSize( + handle, matA, matB, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, + &buffer_size)); + }); + framework::Tensor buffer; + float* buffer_data = buffer.mutable_data( + {static_cast(buffer_size)}, context.GetPlace()); + + context.CusparseCall([&](cusparseHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + 
platform::dynload::cusparseDenseToSparse_analysis( + handle, matA, matB, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, + buffer_data)); + }); + + if (format == CUSPARSE_FORMAT_COO) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseCooSetPointers( + matB, rows, cols, reinterpret_cast(values))); + } else if (format == CUSPARSE_FORMAT_CSR) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseCsrSetPointers( + matB, rows, cols, reinterpret_cast(values))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "the sparse format [%s] is not supported", format)); + } + context.CusparseCall([&](cusparseHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseDenseToSparse_convert( + handle, matA, matB, CUSPARSE_DENSETOSPARSE_ALG_DEFAULT, buffer_data)); + }); +} +template <> +template +void Sparse::DenseToSparseCoo( + const int M, const int N, const T* dense, int64_t* rows, int64_t* cols, + T* values) const { + DenseToSparse(context_, M, N, dense, rows, cols, values, + CUSPARSE_FORMAT_COO); +} + +template <> +template +void Sparse::DenseToSparseCsr( + const int M, const int N, const T* dense, int64_t* crows, int64_t* cols, + T* values) const { + DenseToSparse(context_, M, N, dense, crows, cols, values, + CUSPARSE_FORMAT_CSR); +} + +template +void SparseToDense(const platform::CUDADeviceContext& context, const int64_t M, + const int64_t N, const int64_t nnz, const int64_t* rows, + const int64_t* cols, const T* values, T* dense, + const cusparseFormat_t format) { + cusparseSpMatDescr_t matA; + cusparseDnMatDescr_t matB; + + cudaDataType_t dtype = GetGpuDataType(); + if (format == CUSPARSE_FORMAT_COO) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseCreateCoo( + &matA, M, N, nnz, + const_cast(reinterpret_cast(rows)), + const_cast(reinterpret_cast(cols)), + const_cast(reinterpret_cast(values)), + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, dtype)); + } else if (format == CUSPARSE_FORMAT_CSR) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseCreateCsr( + &matA, M, N, nnz, + const_cast(reinterpret_cast(rows)), + const_cast(reinterpret_cast(cols)), + const_cast(reinterpret_cast(values)), + CUSPARSE_INDEX_64I, CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO, + dtype)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "the sparse format [%s] is not supported", format)); + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseCreateDnMat( + &matB, M, N, N, reinterpret_cast(dense), dtype, + CUSPARSE_ORDER_ROW)); + + size_t buffer_size = 0; + context.CusparseCall([&](cusparseHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cusparseSparseToDense_bufferSize( + handle, matA, matB, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, + &buffer_size)); + }); + framework::Tensor buffer; + float* buffer_data = buffer.mutable_data( + {static_cast(buffer_size)}, context.GetPlace()); + + context.CusparseCall([&](cusparseHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusparseSparseToDense( + handle, matA, matB, CUSPARSE_SPARSETODENSE_ALG_DEFAULT, buffer_data)); + }); +} + +template <> +template +void Sparse::SparseCooToDense( + const int64_t M, const int64_t N, const int64_t nnz, const int64_t* rows, + const int64_t* cols, const T* values, T* dense) const { + SparseToDense(context_, M, N, nnz, rows, cols, values, dense, + CUSPARSE_FORMAT_COO); +} + +template <> +template +void Sparse::SparseCsrToDense( + const int64_t M, const int64_t N, const int64_t nnz, const int64_t* crows, + const int64_t* cols, const T* values, T* dense) const 
{ + SparseToDense(context_, M, N, nnz, crows, cols, values, dense, + CUSPARSE_FORMAT_CSR); +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc index bcb2b92780cc8..69fd2dbb85246 100644 --- a/paddle/fluid/operators/math/unpooling.cc +++ b/paddle/fluid/operators/math/unpooling.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -96,10 +96,101 @@ class Unpool2dMaxGradFunctor { } } }; + +template +class Unpool3dMaxFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + int input_feasize = input_depth * input_height * input_width; + int output_feasize = output_depth * output_height * output_width; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + + PADDLE_ENFORCE_LT( + index, output_feasize, + platform::errors::InvalidArgument( + "index should less than output tensor depth * output tensor " + "height " + "* output tensor width. Expected %ld < %ld, but got " + "%ld >= %ld. Please check input value.", + index, output_feasize, index, output_feasize)); + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; +template +class Unpool3dMaxGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + int input_feasize = input_depth * input_height * input_width; + int output_feasize = output_depth * output_height * output_width; + const int* indices_data = indices.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE_LT( + index, output_feasize, + platform::errors::InvalidArgument( + "index should less than output tensor depth * output tensor " + "height " + "* output tensor width. Expected %ld < %ld, but got " + "%ld >= %ld. 
Please check input value.", + index, output_feasize, index, output_feasize)); + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; + template class Unpool2dMaxGradFunctor; template class Unpool2dMaxGradFunctor; template class Unpool2dMaxFunctor; template class Unpool2dMaxFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxFunctor; +template class Unpool3dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index dbb3d64350cae..973865caba688 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 paddlepaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -51,6 +51,45 @@ __global__ void KernelUnpool2dMaxGrad( /* * All tensors are in NCHW format. */ + +template +__global__ void KernelUnpool3dMax(const int nthreads, const T* input_data, + const int* indices_data, + const int input_depth, const int input_height, + const int input_width, const int channels, + T* output_data, const int output_depth, + const int output_height, + const int output_width) { + CUDA_KERNEL_LOOP(linearIndex, nthreads) { + int c = (linearIndex / input_depth / input_width / input_height) % channels; + int n = linearIndex / input_depth / input_width / input_height / channels; + output_data += + (n * channels + c) * output_depth * output_height * output_width; + int maxind = indices_data[linearIndex]; + output_data[maxind] = input_data[linearIndex]; + } +} + +template +__global__ void KernelUnpool3dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_depth, const int input_height, const int input_width, + const int channels, const T* output_data, const T* output_grad, + const int output_depth, const int output_height, const int output_width, + T* input_grad) { + CUDA_KERNEL_LOOP(linearIndex, nthreads) { + int c = (linearIndex / input_depth / input_width / input_height) % channels; + int n = linearIndex / input_depth / input_width / input_height / channels; + output_grad += + (n * channels + c) * output_depth * output_height * output_width; + int maxind = indices_data[linearIndex]; + input_grad[linearIndex] = output_grad[maxind]; + } +} +/* + * All tensors are in NCDHW format. 
+ */ + template class Unpool2dMaxFunctor { public: @@ -112,10 +151,82 @@ class Unpool2dMaxGradFunctor { output_width, input_grad_data); } }; + +template +class Unpool3dMaxFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output->dims()[1]; + const int output_depth = output->dims()[2]; + const int output_height = output->dims()[3]; + const int output_width = output->dims()[4]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else + int threads = 1024; +#endif + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool3dMax<<>>( + input.numel(), input_data, indices_data, input_depth, input_height, + input_width, output_channels, output_data, output_depth, output_height, + output_width); + } +}; +/* + * All tensors are in NCDHW format. + */ +template +class Unpool3dMaxGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_depth = input.dims()[2]; + const int input_height = input.dims()[3]; + const int input_width = input.dims()[4]; + const int output_channels = output.dims()[1]; + const int output_depth = output.dims()[2]; + const int output_height = output.dims()[3]; + const int output_width = output.dims()[4]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else + int threads = 1024; +#endif + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool3dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_depth, input_height, + input_width, output_channels, output_data, output_grad_data, + output_depth, output_height, output_width, input_grad_data); + } +}; + template class Unpool2dMaxGradFunctor; template class Unpool2dMaxGradFunctor; template class Unpool2dMaxFunctor; template class Unpool2dMaxFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxFunctor; +template class Unpool3dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.h b/paddle/fluid/operators/math/unpooling.h index 74ca39d114e26..63bd8186adeb2 100644 --- a/paddle/fluid/operators/math/unpooling.h +++ b/paddle/fluid/operators/math/unpooling.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
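// ---------------------------------------------------------------------------
// Editor's aside: a small illustration (an assumption based on the functors
// above, where `indices` holds flat offsets inside one D*H*W output slice) of
// how such a max-unpool index maps back to (d, h, w) coordinates in NCDHW
// layout.
#include <tuple>

std::tuple<int, int, int> DecodeUnpool3dIndex(int index, int out_h, int out_w) {
  const int plane = out_h * out_w;  // elements per depth slice
  const int d = index / plane;
  const int h = (index % plane) / out_w;
  const int w = index % out_w;
  return std::make_tuple(d, h, w);
}
// ---------------------------------------------------------------------------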
@@ -33,6 +33,22 @@ class Unpool2dMaxGradFunctor { const framework::Tensor& output_grad, framework::Tensor* input_grad); }; + +template +class Unpool3dMaxFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output); +}; +template +class Unpool3dMaxGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad); +}; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 29e2cd08ce9fe..717c1b5c0ed15 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -695,9 +692,32 @@ class MatMulOp : public framework::OperatorWithKernel { "received %d", reshape_out_size)); - auto it = std::find(reshape_out.begin(), reshape_out.end(), -1); + // int num_negative = std::count(reshape_out.begin(), reshape_out.end(), + // -1); + // PADDLE_ENFORCE_LE(num_negative, 1, + // platform::errors::InvalidArgument( + // "The max number of -1 in fused_reshape_Out is 1 " + // "but received %d.", + // num_negative)); + + // auto it_zero = std::find(reshape_out.begin(), reshape_out.end(), 0); + // if (it_zero != reshape_out.end()) { + // for (uint64_t i = 0; i < reshape_out.size(); i++) { + // if (reshape_out[i] == 0) { + // PADDLE_ENFORCE_LT( + // i, ddim_out.size(), + // platform::errors::InvalidArgument( + // "The index of 0 in fused_reshape_Out ", + // "should be less than output dim size, ", + // "but the index is %d and output dim size is %d", i, + // ddim_out.size())); + // reshape_out[i] = ddim_out.at(i); + // } + // } + // } // if "-1" is present then one of reshape dims must be infered + auto it = std::find(reshape_out.begin(), reshape_out.end(), -1); if (it != reshape_out.end()) { int index = std::distance(reshape_out.begin(), it); @@ -840,17 +860,13 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { #endif AddComment(R"DOC( MatMul Operator. - - This operator is used to perform (batched) matrix multiplication over the last two dimensions of the input tensors `X` and `Y`. - If a transpose flag is specified, the last two dimensions of the tensor are transposed. If the tensor is rank-1 of shape [D], then for `X` it is treated as [1, D] in nontransposed form and as [D, 1] in transposed form, whereas for `Y` it is the opposite: It is treated as [D, 1] in nontransposed form and as [1, D] in transposed form. 
- Examples without transpose: - X: [K], Y: [K] => Out: [1] - X: [K], Y: [K, N] => Out: [N] @@ -858,10 +874,8 @@ Examples without transpose: - X: [M, K], Y: [B, K, N] => Out: [B, M, N] - X: [B, M, K], Y: [B, K, N] => Out: [B, M, N] - X: [B, ..., M, K], Y: [B, ..., K, N] => Out: [B, ..., M, N] - Example of matrix multiplication with head_number of H - X: [B, M, K], Y: [B, K, N] => Out: [B, M, H * N] - The behavior is designed to be similar to the `numpy.matmul` function. The differences are: - When the rank of the input data is less than or equal to 3, it @@ -872,10 +886,8 @@ The differences are: - We add `head_number` attribute, which is used to multiple two matrixes head by head, and eventually concatenates the output of several (head_number) small matrixes multiplication. - Both the input `X` and `Y` can carry the LoD (Level of Details) information, or not. But the output only shares the LoD information with input `X`. - )DOC"); } }; diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 24201b1ba84cc..5add86f5b3c74 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -194,9 +194,32 @@ class MatMulV2Op : public framework::OperatorWithKernel { "received %d", reshape_out_size)); - auto it = std::find(reshape_out.begin(), reshape_out.end(), -1); + // int num_negative = std::count(reshape_out.begin(), reshape_out.end(), + // -1); + // PADDLE_ENFORCE_LE(num_negative, 1, + // platform::errors::InvalidArgument( + // "The max number of -1 in fused_reshape_Out is 1 " + // "but received %d.", + // num_negative)); + + // auto it_zero = std::find(reshape_out.begin(), reshape_out.end(), 0); + // if (it_zero != reshape_out.end()) { + // for (uint64_t i = 0; i < reshape_out.size(); i++) { + // if (reshape_out[i] == 0) { + // PADDLE_ENFORCE_LT( + // i, ddim_out.size(), + // platform::errors::InvalidArgument( + // "The index of 0 in fused_reshape_Out ", + // "should be less than output dim size, ", + // "but the index is %d and output dim size is %d", i, + // ddim_out.size())); + // reshape_out[i] = ddim_out.at(i); + // } + // } + // } // if "-1" is present then one of reshape dims must be infered + auto it = std::find(reshape_out.begin(), reshape_out.end(), -1); if (it != reshape_out.end()) { int index = std::distance(reshape_out.begin(), it); diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index fc0f1416cc138..b257f345eaf36 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -28,7 +28,7 @@ limitations under the License. 
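// ---------------------------------------------------------------------------
// Editor's aside: a standalone sketch (hypothetical helper, not the operator
// code) of the rule noted above for fused_reshape_Out: a single -1 entry is
// inferred as total_elements divided by the product of the known dimensions.
#include <cstddef>
#include <cstdint>
#include <vector>

bool InferNegativeOneDim(std::vector<int64_t>* shape, int64_t total_elements) {
  int64_t known_product = 1;
  int unknown_index = -1;
  for (std::size_t i = 0; i < shape->size(); ++i) {
    if ((*shape)[i] == -1) {
      if (unknown_index != -1) return false;  // at most one -1 is allowed
      unknown_index = static_cast<int>(i);
    } else {
      known_product *= (*shape)[i];
    }
  }
  if (unknown_index == -1) return known_product == total_elements;
  if (known_product == 0 || total_elements % known_product != 0) return false;
  (*shape)[unknown_index] = total_elements / known_product;
  return true;
}
// ---------------------------------------------------------------------------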
*/ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/linalg.h" +#include "paddle/pten/kernels/matmul_kernel.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" @@ -384,8 +384,8 @@ class MatMulV2Kernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*Out); // call new kernel - pten::Matmul(dev_ctx, *pt_x.get(), *pt_y.get(), trans_x, trans_y, - pt_out.get()); + pten::MatmulKernel(dev_ctx, *pt_x, *pt_y, trans_x, trans_y, + pt_out.get()); } }; diff --git a/paddle/fluid/operators/memcpy_d2h_op.h b/paddle/fluid/operators/memcpy_d2h_op.h index eefefea77bed4..efa8af8054fc8 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.h +++ b/paddle/fluid/operators/memcpy_d2h_op.h @@ -69,16 +69,24 @@ class MemcpyD2HFunctor { } private: + static constexpr size_t WAIT_THRESHOLD = 64 * 1024; void CopyLoDTensor(const framework::LoDTensor &src, framework::LoDTensor &dst) const { // NOLINT if (dst_place_type_ == 1) { framework::TensorCopy(src, platform::CUDAPinnedPlace(), dev_ctx_, &dst); } else if (dst_place_type_ == 0) { - framework::TensorCopySync(src, platform::CPUPlace(), &dst); + framework::TensorCopy(src, platform::CPUPlace(), dev_ctx_, &dst); } else { PADDLE_THROW(platform::errors::Unimplemented( "memcpy dst_place_type: %d is not supported yet.", dst_place_type_)); } + // NOTE(Aurelius84): host <-> device memory copies of a memory block of 64 + // KB or less are asynchronous. See + // https://forums.developer.nvidia.com/t/host-device-memory-copies-up-to-64-kb-are-asynchronous/17907 + if (src.memory_size() <= WAIT_THRESHOLD) { + dev_ctx_.Wait(); + } + dst.set_lod(src.lod()); } diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 43ac5984bc8c8..a19dc3367a14b 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/stream/stream.h" namespace paddle { namespace platform { @@ -42,11 +43,15 @@ class MemcpyH2DFunctor { void operator()(const framework::LoDTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - out_tensor.mutable_data( - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()), - lod_tensor.type(), - static_cast(&dev_ctx_)->stream()); + auto stream = + static_cast(&dev_ctx_)->stream(); +#else + auto stream = nullptr; #endif + out_tensor.mutable_data( + dev_ctx_.GetPlace(), lod_tensor.type(), + platform::Stream(reinterpret_cast(stream))); + if (dst_place_type_ == 0 || dst_place_type_ == 1) { framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); diff --git a/paddle/fluid/operators/mish_op.cc b/paddle/fluid/operators/mish_op.cc deleted file mode 100644 index ea754b5b1e941..0000000000000 --- a/paddle/fluid/operators/mish_op.cc +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
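// ---------------------------------------------------------------------------
// Editor's aside: an illustrative CUDA-runtime sketch (not the Paddle functor
// above) of the pattern introduced in memcpy_d2h_op.h: issue the copy on a
// stream and, per the NOTE that blocks of 64 KB or less may behave
// asynchronously, synchronize before the host reads a small destination buffer.
#include <cuda_runtime.h>
#include <cstddef>

constexpr std::size_t kWaitThreshold = 64 * 1024;

cudaError_t CopyToHostAndMaybeWait(void* dst_host, const void* src_device,
                                   std::size_t bytes, cudaStream_t stream) {
  cudaError_t err = cudaMemcpyAsync(dst_host, src_device, bytes,
                                    cudaMemcpyDeviceToHost, stream);
  if (err != cudaSuccess) return err;
  if (bytes <= kWaitThreshold) {
    err = cudaStreamSynchronize(stream);  // ensure the small copy is visible
  }
  return err;
}
// ---------------------------------------------------------------------------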
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/mish_op.h" -#include -#include - -namespace paddle { -namespace operators { - -class MishOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mish"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mish"); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); - } -}; - -class MishOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input of Mish operator"); - AddOutput("Out", "Output of Mish operator"); - AddAttr( - "threshold", - "Constant threshold of softplus in Mish operator. Approximate value " - "of softplus will be used if absolute value of input is greater than " - ":attr:`threshold`") - .SetDefault(20.f); - AddComment(R"DOC( -Mish Activation Operator. - -.. math:: - softplus = \begin{cases} - x, \text{if } x > \text{threshold} \\ - e^{x}, \text{if } x < -\text{threshold} \\ - \ln(1 + e^{x}), \text{otherwise} - \end{cases} - - out = x * \tanh(softplus) - -)DOC"); - } -}; - -// The operator to calculate gradients of a prelu operator. 
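// ---------------------------------------------------------------------------
// Editor's aside: a self-contained check (assuming the smooth log1p(exp(x))
// branch of softplus, i.e. |x| below the threshold) that the analytic Mish
// gradient used by the removed kernels matches a central finite difference.
#include <cmath>
#include <cstdio>

double Mish(double x) { return x * std::tanh(std::log1p(std::exp(x))); }

double MishGrad(double x) {
  const double sp = std::log1p(std::exp(x));  // softplus(x)
  const double tsp = std::tanh(sp);
  const double grad_sp = -std::expm1(-sp);    // d softplus / dx == sigmoid(x)
  return x * (1.0 - tsp * tsp) * grad_sp + tsp;
}

int main() {
  const double x = 0.7, eps = 1e-6;
  const double numeric = (Mish(x + eps) - Mish(x - eps)) / (2.0 * eps);
  std::printf("analytic = %.8f, numeric = %.8f\n", MishGrad(x), numeric);
  return 0;
}
// ---------------------------------------------------------------------------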
-class MishGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mish"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - "Out@GRAD", "mish"); - - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); - } - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); - } -}; - -template -class MishGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("mish_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(mish, ops::MishOp, ops::MishOpMaker, - ops::MishGradOpMaker, - ops::MishGradOpMaker); -REGISTER_OPERATOR(mish_grad, ops::MishGradOp); -REGISTER_OP_CPU_KERNEL( - mish, ops::MishFP32CPUKernel, - ops::MishCPUKernel); -REGISTER_OP_CPU_KERNEL( - mish_grad, ops::MishGradFP32CPUKernel, - ops::MishGradCPUKernel); diff --git a/paddle/fluid/operators/mish_op.cu b/paddle/fluid/operators/mish_op.cu deleted file mode 100644 index 4ca07b650c80a..0000000000000 --- a/paddle/fluid/operators/mish_op.cu +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mish_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeMishFw(const T* in, T* out, const int numel, - const float threshold) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < numel; tid += stride) { - T x = in[tid]; - T sp = CalcSoftplus(x, threshold); - out[tid] = x * tanh(sp); - } -} - -// expf instead of exp should be used for float type, complement -// and register float kernel separatelly -__global__ void KeMishFwFP32(const float* in, float* out, const int numel, - const float threshold) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < numel; tid += stride) { - float x = in[tid]; - float sp = CalcSoftplusFP32(x, threshold); - out[tid] = x * tanhf(sp); - } -} - -template -__global__ void KeMishBw(const T* in, const T* dout, T* din, const int numel, - const float threshold) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < numel; tid += stride) { - T x = in[tid]; - T sp = CalcSoftplus(x, threshold); - T tsp = tanh(sp); - T grad_sp = -expm1(-sp); - T grad_tsp = (static_cast(1) - tsp * tsp) * grad_sp; - din[tid] = dout[tid] * (x * grad_tsp + tsp); - } -} - -__global__ void KeMishBwFP32(const float* in, const float* dout, float* din, - const int numel, const float threshold) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (; tid < numel; tid += stride) { - float x = in[tid]; - float sp = CalcSoftplusFP32(x, threshold); - float tsp = tanhf(sp); - float grad_sp = -expm1f(-sp); - float grad_tsp = (static_cast(1) - tsp * tsp) * grad_sp; - din[tid] = dout[tid] * (x * grad_tsp + tsp); - } -} - -template -class MishCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const float threshold = ctx.Attr("threshold"); - - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); - - const int numel = x->numel(); - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel); - KeMishFw<<>>(x_data, out_data, numel, - threshold); - } -}; - -template -class MishFP32CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const float threshold = ctx.Attr("threshold"); - - const float* x_data = x->data(); - float* out_data = out->mutable_data(ctx.GetPlace()); - - const int numel = x->numel(); - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel); - KeMishFwFP32<<>>(x_data, out_data, - numel, threshold); - } -}; - -template -class MishGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto threshold = ctx.Attr("threshold"); - - const T* x_data = x->data(); - const T* dout_data = dout->data(); - T* dx_data = 
dx->mutable_data(ctx.GetPlace()); - - const int numel = x->numel(); - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel); - KeMishBw<<>>( - x_data, dout_data, dx_data, numel, threshold); - } -}; - -template -class MishGradFP32CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto threshold = ctx.Attr("threshold"); - - const float* x_data = x->data(); - const float* dout_data = dout->data(); - float* dx_data = dx->mutable_data(ctx.GetPlace()); - - const int numel = x->numel(); - - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), numel); - KeMishBwFP32<<>>( - x_data, dout_data, dx_data, numel, threshold); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mish, ops::MishFP32CUDAKernel, - ops::MishCUDAKernel) -REGISTER_OP_CUDA_KERNEL( - mish_grad, ops::MishGradFP32CUDAKernel, - ops::MishGradCUDAKernel) diff --git a/paddle/fluid/operators/mish_op.h b/paddle/fluid/operators/mish_op.h deleted file mode 100644 index 86ccb57d929e5..0000000000000 --- a/paddle/fluid/operators/mish_op.h +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -HOSTDEVICE static T CalcSoftplus(T x, float threshold) { - if (threshold > 0 && x > threshold) { - return x; - } else if (threshold > 0 && x < -threshold) { - return exp(x); - } else { - return log1p(exp(x)); - } -} - -// expf instead of exp should be used for float type, complement -// and register float kernel separatelly -HOSTDEVICE static float CalcSoftplusFP32(float x, float threshold) { - if (threshold > 0 && x > threshold) { - return x; - } else if (threshold > 0 && x < -threshold) { - return expf(x); - } else { - return log1pf(expf(x)); - } -} - -template -class MishCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const float threshold = ctx.Attr("threshold"); - - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); - - int numel = x->numel(); - for (int i = 0; i < numel; i++) { - T x_d = x_data[i]; - T sp = CalcSoftplus(x_d, threshold); - out_data[i] = x_d * std::tanh(sp); - } - } -}; - -template -class MishFP32CPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const float threshold = ctx.Attr("threshold"); - - const float* x_data = x->data(); - float* out_data = out->mutable_data(ctx.GetPlace()); - - int numel = x->numel(); - for (int i = 0; i < numel; i++) { - float x_d = x_data[i]; - float sp = CalcSoftplusFP32(x_d, threshold); - out_data[i] = x_d * std::tanh(sp); - } - } -}; - -template -class MishGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto threshold = ctx.Attr("threshold"); - - const T* x_data = x->data(); - const T* dout_data = dout->data(); - T* dx_data = dx->mutable_data(ctx.GetPlace()); - - int numel = x->numel(); - for (int i = 0; i < numel; i++) { - T x_d = x_data[i]; - T sp = CalcSoftplus(x_d, threshold); - T tsp = std::tanh(sp); - T grad_sp = -std::expm1(-sp); - T grad_tsp = (static_cast(1) - tsp * tsp) * grad_sp; - dx_data[i] = dout_data[i] * (x_d * grad_tsp + tsp); - } - } -}; - -template -class MishGradFP32CPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto threshold = ctx.Attr("threshold"); - - const float* x_data = x->data(); - const float* dout_data = dout->data(); - float* dx_data = dx->mutable_data(ctx.GetPlace()); - - int numel = x->numel(); - for (int i = 0; i < numel; i++) { - float x_d = x_data[i]; - float sp = CalcSoftplusFP32(x_d, threshold); - float tsp = std::tanh(sp); - float grad_sp = -std::expm1f(-sp); - float grad_tsp = (static_cast(1) - tsp * tsp) * grad_sp; - dx_data[i] = dout_data[i] * (x_d * grad_tsp + tsp); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 9c5d03c17afba..8630515a9fdaf 100644 --- 
a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -83,9 +83,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx, const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Out"); + auto *out = ctx.Output("Out"); - bool is_inplaced = x->IsSharedBufferWith(*y); + bool is_inplaced = x->IsSharedBufferWith(*out); platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, ctx.GetPlace(), x); @@ -94,9 +94,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx, std::shared_ptr dst_memory_p = nullptr; if (is_inplaced) { dst_memory_p = src_memory_p; - y->mutable_data(ctx.GetPlace()); + out->mutable_data(ctx.GetPlace()); } else { - dst_memory_p = handler.AcquireDstMemory(y); + dst_memory_p = handler.AcquireDstMemory(out); } auto activation_p = handler.AcquireForwardPrimitive(); @@ -105,8 +105,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, astream, {{DNNL_ARG_FROM, *src_memory_p}, {DNNL_ARG_TO, *dst_memory_p}}); astream.wait(); - y->set_layout(DataLayout::kMKLDNN); - y->set_format(GetMKLDNNFormat(*dst_memory_p)); + out->set_layout(DataLayout::kMKLDNN); + out->set_format(GetMKLDNNFormat(*dst_memory_p)); } template @@ -116,15 +116,15 @@ void eltwise_grad(const framework::ExecutionContext &ctx, const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); - const auto *diff_y = ctx.Input(framework::GradVarName("Out")); - auto *diff_x = ctx.Output(framework::GradVarName("X")); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, - ctx.GetPlace(), x, diff_y); + ctx.GetPlace(), x, dout); auto src_memory_p = handler.AcquireBackwardSrcMemory(x); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory(diff_x); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); auto activation_backward_p = handler.AcquireBackwardPrimitive(); auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); @@ -134,8 +134,37 @@ void eltwise_grad(const framework::ExecutionContext &ctx, {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format(GetMKLDNNFormat(*diff_src_memory_p)); + dx->set_layout(DataLayout::kMKLDNN); + dx->set_format(GetMKLDNNFormat(*diff_src_memory_p)); +} + +template +void eltwise_grad_use_out(const framework::ExecutionContext &ctx, + dnnl::algorithm algorithm) { + auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); + + const auto *out = ctx.Input("Out"); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + + platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, + ctx.GetPlace(), out, dout); + + auto dst_memory_p = handler.AcquireBackwardSrcMemory(out); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto activation_backward_p = handler.AcquireBackwardPrimitive(); + + auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + activation_backward_p->execute(astream, + {{DNNL_ARG_DST, *dst_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + 
{DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + dx->set_layout(DataLayout::kMKLDNN); + dx->set_format(GetMKLDNNFormat(*diff_src_memory_p)); } template @@ -152,6 +181,13 @@ struct MKLDNNActivationGradFunc : public BaseActivationFunctor { } }; +template +struct MKLDNNActivationGradUseOutFunc : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + eltwise_grad_use_out(ctx, algorithm); + } +}; + template struct GeluMKLDNNFunctor : public BaseActivationFunctor { void operator()(const framework::ExecutionContext &ctx) const { @@ -217,6 +253,9 @@ using AbsMKLDNNFunctor = MKLDNNActivationFunc; template using EluMKLDNNFunctor = MKLDNNActivationFunc; +template +using ExpMKLDNNFunctor = MKLDNNActivationFunc; + template using ReluMKLDNNGradFunctor = MKLDNNActivationGradFunc; @@ -234,24 +273,29 @@ using HardSwishMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using SigmoidMKLDNNGradFunctor = - MKLDNNActivationGradFunc; +using SigmoidMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_logistic_use_dst_for_bwd>; template -using TanhMKLDNNGradFunctor = - MKLDNNActivationGradFunc; +using TanhMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_tanh_use_dst_for_bwd>; template -using SqrtMKLDNNGradFunctor = - MKLDNNActivationGradFunc; +using SqrtMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_sqrt_use_dst_for_bwd>; template using AbsMKLDNNGradFunctor = MKLDNNActivationGradFunc; template -using EluMKLDNNGradFunctor = - MKLDNNActivationGradFunc; +using EluMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_elu_use_dst_for_bwd>; + +template +using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc< + T, dnnl::algorithm::eltwise_exp_use_dst_for_bwd>; + } // namespace operators } // namespace paddle @@ -281,10 +325,10 @@ namespace ops = paddle::operators; __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ __macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \ - __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \ - __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \ + __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor); \ __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); \ - __macro(elu, EluMKLDNNFunctor, EluMKLDNNGradFunctor); + __macro(elu, EluMKLDNNFunctor, EluMKLDNNGradUseOutFunctor); \ + __macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, @@ -292,7 +336,9 @@ REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, - SigmoidMKLDNNGradFunctor); + SigmoidMKLDNNGradUseOutFunctor); +REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sqrt, SqrtMKLDNNFunctor, + SqrtMKLDNNGradUseOutFunctor); namespace ops = paddle::operators; REGISTER_OP_KERNEL( diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index d584da72393bc..1bde58f7c4edb 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -806,11 +806,10 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { 
ctx.Attr>("Scale_weights"); const bool is_multi_channel = scale_weights_data.size() > 1; const int& groups = ctx.Attr("groups"); - const bool& is_test = ctx.Attr("is_test"); int mask_reorder = is_multi_channel ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - filter, groups, false, is_test, scale_weights_data, mask_reorder); + filter, groups, false, true, scale_weights_data, mask_reorder); std::shared_ptr dst_memory_p; if (fuse_residual_conn) { @@ -842,7 +841,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { auto p_scales_tuple = handler.get_int8_bias_scales(ctx); auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( - bias, is_test, std::get<1>(*p_scales_tuple), + bias, true, std::get<1>(*p_scales_tuple), std::get<0>(*p_scales_tuple)); args.insert({DNNL_ARG_BIAS, *bias_memory_p}); } @@ -1013,6 +1012,41 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, ops::kConvMKLDNNFP32, ops::ConvMKLDNNGradOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, BF16, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNGradOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(depthwise_conv2d, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + depthwise_conv2d, MKLDNN, ::paddle::platform::CPUPlace, BF16, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(depthwise_conv2d, MKLDNN, + ::paddle::platform::CPUPlace, U8, + ops::kConvMKLDNNINT8, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(depthwise_conv2d, MKLDNN, + ::paddle::platform::CPUPlace, S8, + ops::kConvMKLDNNINT8, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(depthwise_conv2d_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNGradOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + depthwise_conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, BF16, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNGradOpKernel); + REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index f567f4660534c..833535eb878e9 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -176,11 +176,15 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(nearest_interp, MKLDNN, ::paddle::platform::CPUPlace, - ops::InterpolateMKLDNNKernel); + ops::InterpolateMKLDNNKernel, + ops::InterpolateMKLDNNKernel, + ops::InterpolateMKLDNNKernel); REGISTER_OP_KERNEL(bilinear_interp, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel); REGISTER_OP_KERNEL(nearest_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, - ops::InterpolateMKLDNNKernel); + ops::InterpolateMKLDNNKernel, + ops::InterpolateMKLDNNKernel, + ops::InterpolateMKLDNNKernel); REGISTER_OP_KERNEL(bilinear_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 5cb6ae34dcecf..a8d4b852ca3c2 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -295,7 +295,7 @@ 
class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { x_bd_dims[x_bd_dims.size() - 2] = x_dims[0]; } else { for (size_t i = 0; i < x_dims.size(); ++i) { - x_bd_dims[i] = x_dims[i]; + x_bd_dims[x_bd_dims.size() - x_dims.size() + i] = x_dims[i]; } } if (y_dims.size() == 1) { @@ -305,21 +305,21 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { y_bd_dims[y_bd_dims.size() - 2] = y_dims[0]; } else { for (size_t i = 0; i < y_dims.size(); ++i) { - y_bd_dims[i] = y_dims[i]; + y_bd_dims[y_bd_dims.size() - y_dims.size() + i] = y_dims[i]; } } - if ((y_dims.size() == x_dims.size()) && y_dims.size() > 2 && - !IsOutputFused(ctx)) { - for (size_t i = 0; i < x_dims.size() - 2; ++i) { + if (!IsOutputFused(ctx) && x_dims.size() > 2 && y_dims.size() > 2) { + for (size_t i = 0; i < x_bd_dims.size() - 2; ++i) { PADDLE_ENFORCE_EQ( - x_dims[i] == y_dims[i] || x_dims[i] == 1 || y_dims[i] == 1, true, - paddle::platform::errors::InvalidArgument( - "Tensor dimensions are incorrect for broadcasting." - "Dimensions in X and Y must be same or equal to 1, but " - "received x_dim[%d]=%d and y_dims[%d]= %d", - i, x_dims[i], i, y_dims[i])); - out_dims[i] = std::max(x_dims[i], y_dims[i]); + x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] == 1 || + y_bd_dims[i] == 1, + true, paddle::platform::errors::InvalidArgument( + "Tensor dimensions are incorrect for broadcasting." + "Dimensions in X and Y must be same or equal to 1, but " + "received x_dim[%d]=%d and y_dims[%d]= %d", + i, x_bd_dims[i], i, y_bd_dims[i])); + out_dims[i] = std::max(x_bd_dims[i], y_bd_dims[i]); } out->Resize(make_ddim(out_dims)); } @@ -382,11 +382,11 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { dy_tmp->mutable_data(ctx.GetPlace()); } - void ReduceSumForMatmulGradOutput(const ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, - const dnnl::engine onednn_engine, - const Tensor* dx_tmp, Tensor* dx, - std::vector dx_dims) const { + void ReduceSumForMatmulGradOutput( + const ExecutionContext& ctx, const MKLDNNDeviceContext& dev_ctx, + const dnnl::engine onednn_engine, const Tensor* dx_tmp, Tensor* dx, + std::vector& dx_dims, + const std::vector& squeezed_dims) const { paddle::platform::ReductionMKLDNNHandler handler( dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, ctx.GetPlace(), dx_tmp, dx, dx_dims); @@ -402,6 +402,19 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { reduction_p->execute(astream, reduction_args); astream.wait(); + + dx->set_format(paddle::platform::GetMKLDNNFormat( + dst_memory_p->get_desc().reshape(squeezed_dims))); + } + + std::vector ExtendDimsWithOnes(const std::vector& dims, + int new_size) const { + std::vector new_dims(new_size, 1); + for (size_t i = 0; i < dims.size(); ++i) { + new_dims[new_size - dims.size() + i] = dims[i]; + } + + return new_dims; } void RunKernel(const ExecutionContext& ctx) const { @@ -440,8 +453,14 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { bool trans_y = ctx.Attr("trans_y"); auto dout_dims = vectorize(dout->dims()); - int ndims = std::max(x->dims().size(), y->dims().size()); - ndims = std::max(ndims, 3); + size_t ndims = std::max(x->dims().size(), y->dims().size()); + ndims = std::max(ndims, 3); + + if (x_dims.size() != ndims) { + x_dims = ExtendDimsWithOnes(x_dims, ndims); + } else if (y_dims.size() != ndims) { + y_dims = ExtendDimsWithOnes(y_dims, ndims); + } // in broadcasting scenario new memory is required because // reduce sum must be calculated upon broadcasted dims @@ -481,21 
+500,21 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { if (x_dims != dx_bd_dims) { ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dx_tmp, dx, - x_dims); + x_dims, + paddle::framework::vectorize(x->dims())); } else { *dx = std::move(dx_tmp); } if (y_dims != dy_bd_dims) { ReduceSumForMatmulGradOutput(ctx, dev_ctx, onednn_engine, &dy_tmp, dy, - y_dims); + y_dims, + paddle::framework::vectorize(y->dims())); } else { *dy = std::move(dy_tmp); } - dx->set_layout(paddle::framework::DataLayout::kMKLDNN); - dx->set_format(x->format()); - dy->set_layout(paddle::framework::DataLayout::kMKLDNN); - dy->set_format(y->format()); + dx->Resize(x->dims()); + dy->Resize(y->dims()); } private: diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 917692bfbd9d5..c877b7130c55c 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -13,12 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mlu/mlu_baseop.h" -#include -#include -#include -#include -#include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { @@ -57,7 +54,7 @@ class MLUCnnlTensorDescPool { static MLUCnnlTensorDescPool g_cnnl_tensor_desc_pool; -MLUCnnlTensorDesc &MLUCnnlTensorDesc::operator=(MLUCnnlTensorDesc &&rhs) { +MLUCnnlTensorDesc& MLUCnnlTensorDesc::operator=(MLUCnnlTensorDesc&& rhs) { if (raw_tensor_desc) { g_cnnl_tensor_desc_pool.Recycle(raw_tensor_desc); } @@ -138,7 +135,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const int tensor_dim, cnnlSetTensorDescriptorPosition(raw_tensor_desc, position)); } -MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor, +MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor, const cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype) { auto dims = framework::vectorize(tensor.dims()); @@ -156,7 +153,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor, } } -MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor, +MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor, cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype, int position) @@ -165,7 +162,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor, cnnlSetTensorDescriptorPosition(raw_tensor_desc, position)); } -MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor, +MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor, cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype, int position, float scale) @@ -197,31 +194,2359 @@ MLUCnnlActivationDesc::~MLUCnnlActivationDesc() { } } -/* static */ void MLUCnnl::Active(const platform::MLUDeviceContext &ctx, +MLUCnnlPoolingDesc::MLUCnnlPoolingDesc( + const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt, + int window_rows, int window_cols, int64_t pad_up, int64_t pad_down, + int64_t pad_left, int64_t pad_right, int row_stride, int col_stride) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreatePoolingDescriptor(&pooling_desc_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetPooling2dDescriptor( + pooling_desc_, mode, maxpooling_nan_opt, window_rows, window_cols, pad_up, + pad_down, pad_left, pad_right, row_stride, col_stride)); +} + +MLUCnnlPoolingDesc::MLUCnnlPoolingDesc( + const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt, + const int tensor_rank, const std::vector& 
window, + const std::vector& padding, const std::vector& stride) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreatePoolingDescriptor(&pooling_desc_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetPoolingNdDescriptor( + pooling_desc_, mode, maxpooling_nan_opt, tensor_rank, window.data(), + padding.data(), stride.data())); +} + +const cnnlPoolingDescriptor_t MLUCnnlPoolingDesc::get() const { + return pooling_desc_; +} + +MLUCnnlPoolingDesc::~MLUCnnlPoolingDesc() { + if (pooling_desc_) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyPoolingDescriptor(pooling_desc_)); + } +} + +MLUCnnlRandomGeneratorDesc::MLUCnnlRandomGeneratorDesc(const bool is_mlu200, + const int seed) { + if (is_mlu200) { + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_FAST)); + } else { + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_MTGP32)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRandSetPseudoRandomGeneratorSeed(mlu_generator, seed)); + } +} + +const cnnlRandGenerator_t MLUCnnlRandomGeneratorDesc::get() const { + return mlu_generator; +} + +MLUCnnlRandomGeneratorDesc::~MLUCnnlRandomGeneratorDesc() { + if (mlu_generator) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandDestroyGenerator(mlu_generator)); + } +} + +MLUCnnlNMSDesc::MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode, + const float iou_threshold, + const int max_output_size, + const float confidence_threshold, + const int input_layout) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateNmsDescriptor(&nms_desc_)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetNmsDescriptor_v2(nms_desc_, mode, iou_threshold, max_output_size, + confidence_threshold, input_layout)); +} + +const cnnlNmsDescriptor_t MLUCnnlNMSDesc::get() const { return nms_desc_; } + +MLUCnnlNMSDesc::~MLUCnnlNMSDesc() { + if (nms_desc_) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyNmsDescriptor(nms_desc_)); + } +} + +MLUCnnlReduceDesc::MLUCnnlReduceDesc(const std::vector& axis_vec, + const cnnlReduceOp_t reduce_op, + const cnnlDataType_t data_type, + const cnnlNanPropagation_t nan_propagation, + const cnnlReduceIndices_t reduce_indices, + const cnnlIndicesType_t indices_type) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateReduceDescriptor(&reduction_desc_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetReduceDescriptor( + reduction_desc_, const_cast(axis_vec.data()), axis_vec.size(), + reduce_op, data_type, nan_propagation, reduce_indices, indices_type)); +} + +const cnnlReduceDescriptor_t MLUCnnlReduceDesc::get() const { + return reduction_desc_; +} + +MLUCnnlReduceDesc::~MLUCnnlReduceDesc() { + if (reduction_desc_) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyReduceDescriptor(reduction_desc_)); + } +} + +MLUCnnlOpTensorDesc::MLUCnnlOpTensorDesc( + cnnlOpTensorDesc_t op_tensor_op, cnnlDataType_t op_tensor_comp_type, + cnnlNanPropagation_t op_tensor_nan_opt) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateOpTensorDescriptor(&op_tensor_desc_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetOpTensorDescriptor( + op_tensor_desc_, op_tensor_op, op_tensor_comp_type, op_tensor_nan_opt)); +} + +const cnnlOpTensorDescriptor_t MLUCnnlOpTensorDesc::get() const { + return op_tensor_desc_; +} + +MLUCnnlOpTensorDesc::~MLUCnnlOpTensorDesc() { + if (op_tensor_desc_) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyOpTensorDescriptor(op_tensor_desc_)); + } +} + +MLUCnnlConvolutionDesc::MLUCnnlConvolutionDesc( + const int dims, const int pad[], const int stride[], const int dilation[], + const int group_count, const cnnlDataType_t tensor_dtype) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateConvolutionDescriptor(&conv_desc_)); + 
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetConvolutionDescriptor( + conv_desc_, dims, pad, stride, dilation, group_count, tensor_dtype)); +} + +MLUCnnlConvolutionDesc::MLUCnnlConvolutionDesc( + const int dims, const int64_t pad[], const int64_t stride[], + const int64_t dilation[], const int group_count, + const cnnlDataType_t tensor_dtype) { + const int spatial_dims = dims - 2; + const int pad_dims = spatial_dims * 2; + std::vector pad_int32(pad_dims); + std::vector stride_int32(spatial_dims); + std::vector dilation_int32(spatial_dims); + std::vector::const_iterator int64_pad_cbegin(pad); + std::vector::const_iterator int64_pad_cend(pad + pad_dims); + std::vector::const_iterator int64_stride_cbegin(stride); + std::vector::const_iterator int64_stride_cend(stride + spatial_dims); + std::vector::const_iterator int64_dilation_cbegin(dilation); + std::vector::const_iterator int64_dilation_cend(dilation + + spatial_dims); + std::transform(int64_pad_cbegin, int64_pad_cend, pad_int32.begin(), + &CheckedNarrowing); + std::transform(int64_stride_cbegin, int64_stride_cend, stride_int32.begin(), + &CheckedNarrowing); + std::transform(int64_dilation_cbegin, int64_dilation_cend, + dilation_int32.begin(), &CheckedNarrowing); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateConvolutionDescriptor(&conv_desc_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetConvolutionDescriptor( + conv_desc_, dims, pad_int32.data(), stride_int32.data(), + dilation_int32.data(), group_count, tensor_dtype)); +} + +const cnnlConvolutionDescriptor_t MLUCnnlConvolutionDesc::get() const { + return conv_desc_; +} + +MLUCnnlConvolutionDesc::~MLUCnnlConvolutionDesc() { + if (conv_desc_) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyConvolutionDescriptor(conv_desc_)); + } +} + +MLUCnnlBatchSpaceDesc::MLUCnnlBatchSpaceDesc(uint32_t block_shape[], + uint32_t paddings[], + const uint32_t block_shape_size, + const uint32_t paddings_size) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateSpaceBatchNdDescriptor(&op_desc_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetSpaceBatchNdDescriptor( + op_desc_, block_shape, block_shape_size, paddings, paddings_size)); +} + +void MLUCnnlBatchSpaceDesc::getSpace2batchNdextraInputSize( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSpace2batchNdExtraInputSize( + handle, input_desc, op_desc_, &extra_input_size_)); +} + +void MLUCnnlBatchSpaceDesc::getBatch2spaceNdextraInputSize( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBatch2spaceNdExtraInputSize( + handle, input_desc, op_desc_, &extra_input_size_)); +} + +void MLUCnnlBatchSpaceDesc::initSpace2batchNdExtraInput( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, + void* extra_host_input) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlInitSpace2batchNdExtraInput( + handle, input_desc, op_desc_, extra_host_input)); +} + +void MLUCnnlBatchSpaceDesc::initBatch2spaceNdExtraInput( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, + void* extra_host_input) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlInitBatch2spaceNdExtraInput( + handle, input_desc, op_desc_, extra_host_input)); +} + +const cnnlSpaceBatchNdDescriptor_t MLUCnnlBatchSpaceDesc::get() const { + return op_desc_; +} + +size_t MLUCnnlBatchSpaceDesc::getExtraInputSize() const { + return extra_input_size_; +} + 
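// The int64_t overload of MLUCnnlConvolutionDesc above narrows 64-bit pad/
// stride/dilation values into the int arrays cnnl expects via std::transform
// over CheckedNarrowing. A minimal, self-contained sketch of that step is
// shown here; CheckedNarrowing is defined elsewhere in the MLU support code,
// so the range check below is an assumption, not its actual implementation.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

// Assumed behavior: assert the value is representable as int, then cast.
static int CheckedNarrowingSketch(int64_t value) {
  assert(value >= std::numeric_limits<int>::min() &&
         value <= std::numeric_limits<int>::max());
  return static_cast<int>(value);
}

// Mirrors the std::transform calls in the descriptor constructor: copy an
// int64_t range into an int vector element by element, checking each value.
static std::vector<int> NarrowToInt(const int64_t* src, size_t count) {
  std::vector<int> dst(count);
  std::transform(src, src + count, dst.begin(), &CheckedNarrowingSketch);
  return dst;
}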
+MLUCnnlBatchSpaceDesc::~MLUCnnlBatchSpaceDesc() { + if (op_desc_) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroySpaceBatchNdDescriptor(op_desc_)); + } +} + +MLUCnnlTrigonDesc::MLUCnnlTrigonDesc( + const cnnlTrigonFunctionMode_t trigon_function_mode) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateTrigonDescriptor(&trigon_desc_)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTrigonDescriptor(trigon_desc_, trigon_function_mode)); +} + +const cnnlTrigonDescriptor_t MLUCnnlTrigonDesc::get() const { + return trigon_desc_; +} + +MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { + if (trigon_desc_) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyTrigonDescriptor(trigon_desc_)); + } +} + +/* static */ void MLUCnnl::Active(const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, const cnnlTensorDescriptor_t input_desc, - const void *input, + const void* input, const cnnlTensorDescriptor_t output_desc, - void *output) { - cnnlHandle_t handle = ctx.cnnl_handle(); + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); PADDLE_ENFORCE_MLU_SUCCESS(cnnlActivationForward( handle, active_desc, NULL, input_desc, input, NULL, output_desc, output)); } /* static */ void MLUCnnl::ActiveGrad( - const platform::MLUDeviceContext &ctx, - cnnlActivationDescriptor_t active_desc, const void *alpha, const void *beta, - const cnnlTensorDescriptor_t y_desc, const void *y, - const cnnlTensorDescriptor_t diff_y_desc, const void *diff_y, - const cnnlTensorDescriptor_t x_desc, const void *x, - const cnnlTensorDescriptor_t diff_x_desc, void *diff_x) { - cnnlHandle_t handle = ctx.cnnl_handle(); + const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, + const void* alpha, const void* beta, const cnnlTensorDescriptor_t y_desc, + const void* y, const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); PADDLE_ENFORCE_MLU_SUCCESS( cnnlActivationBackward(handle, active_desc, alpha, y_desc, y, diff_y_desc, diff_y, x_desc, x, beta, diff_x_desc, diff_x)); } +/* static */ void MLUCnnl::Concat(const ExecutionContext& ctx, + const int pack_num, const int axis, + const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc, + inputs, workspace_ptr, workspace_size, + output_desc, output)); +} + +/* static */ void MLUCnnl::Div( + const ExecutionContext& ctx, cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t in0_desc, const void* in0, + const cnnlTensorDescriptor_t in1_desc, const void* in1, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetDivWorkspaceSize( + handle, in0_desc, in1_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + 
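// Div above, like most wrappers in this file, follows a three-step scratch-
// memory protocol: ask cnnl how many workspace bytes the op needs, allocate
// that many as a temporary Tensor through the ExecutionContext (so the
// allocation shows up in profiling), then pass the raw pointer and size into
// the compute call. The sketch below captures only that control flow;
// ScratchBuffer, query and run are illustrative stand-ins, not Paddle or
// cnnl APIs.
#include <cstddef>
#include <functional>
#include <vector>

// Stand-in for ctx.AllocateTmpTensor(...) followed by mutable_data(place).
struct ScratchBuffer {
  std::vector<char> storage;
  void* Resize(size_t bytes) {
    storage.resize(bytes);
    return bytes > 0 ? storage.data() : nullptr;
  }
};

// query fills in the required byte count (the cnnlGet*WorkspaceSize role);
// run receives the scratch pointer plus size (the cnnlDiv_v2 role).
inline void RunWithWorkspace(ScratchBuffer* scratch,
                             const std::function<void(size_t*)>& query,
                             const std::function<void(void*, size_t)>& run) {
  size_t workspace_size = 0;
  query(&workspace_size);
  void* workspace_ptr = scratch->Resize(workspace_size);
  run(workspace_ptr, workspace_size);
}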
PADDLE_ENFORCE_MLU_SUCCESS(cnnlDiv_v2(handle, prefer, in0_desc, in0, in1_desc, + in1, workspace_ptr, workspace_size, + output_desc, output)); +} + +/* static */ void MLUCnnl::Fill(const ExecutionContext& ctx, float value, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlFill(handle, value, output_desc, output)); +} + +/* static */ void MLUCnnl::QuantifyOffline( + const ExecutionContext& ctx, cnnlQuantizeMode_t mode, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlQuantizeV1(handle, mode, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::LRN(const ExecutionContext& ctx, + const int local_size, const double alpha, + const double beta, const double k, + const cnnlTensorDescriptor_t input_quant_desc, + const void* input_quant, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetLrnWorkspaceSize( + handle, input_quant_desc, output_desc, local_size, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + const cnnlLrnMode_t mode = CNNL_LRN_CROSS_CHANNEL; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlLrn( + handle, mode, local_size, alpha, beta, k, workspace_ptr, workspace_size, + input_quant_desc, const_cast(input_quant), output_desc, output)); +} + +/* static */ void MLUCnnl::QuantifyOnline( + const ExecutionContext& ctx, const int bitwidth, + const cnnlTensorDescriptor_t input_desc, const void* input, + const bool compute_scale, void* position, void* scale, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetQuantizeParamWorkspaceSize(handle, input_desc, &workspace_size)); + + // use ctx allocate interface for profiling purpose + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + const cnnlQuantizeMode_t mode = + compute_scale ? 
CNNL_QUANTIZE_POSITION_SCALE : CNNL_QUANTIZE_POSITION; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeParam( + handle, mode, input_desc, input, bitwidth, workspace_ptr, workspace_size, + position, scale, nullptr)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeV2(handle, mode, input_desc, input, + position, scale, nullptr, + output_desc, output)); +} + +/* static */ void MLUCnnl::Range(const ExecutionContext& ctx, const void* start, + const void* end, const void* step, + const cnnlDataType_t output_dtype, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlArange(handle, start, end, step, output_dtype, output)); +} + +/* static */ void MLUCnnl::Round(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRound(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::SparseSoftmaxXentWithLogits( + const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, + const cnnlTensorDescriptor_t x_desc, const void* input, + const cnnlTensorDescriptor_t label_desc, const void* label, + const cnnlTensorDescriptor_t y_desc, void* output, + const cnnlTensorDescriptor_t diff_y_desc, void* back_out) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSparseSoftmaxCrossEntropyWithLogits( + handle, mode, x_desc, input, label_desc, label, y_desc, output, + diff_y_desc, back_out)); +} + +/* static */ void MLUCnnl::Cumsum(const ExecutionContext& ctx, const int axis, + const bool exclusive, const bool reverse, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t ouput_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + // NAN propagation mode: Only support CNNL_NOT_PROPAGATE_NAN now. 
+ cnnlNanPropagation_t mode = CNNL_NOT_PROPAGATE_NAN; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCumsum(handle, input_desc, input, axis, + exclusive, reverse, mode, ouput_desc, + output)); +} + +/* static */ void MLUCnnl::BroadcastTo(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlExpand(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::AssignAdd(const ExecutionContext& ctx, + const void* alpha, const void* beta, + const cnnlTensorDescriptor_t update_desc, + const void* update, + const cnnlTensorDescriptor_t param_desc, + void* param) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignAdd( + handle, alpha, update_desc, update, nullptr, 0, beta, param_desc, param)); +} + +/* static */ void MLUCnnl::AssignSub(const ExecutionContext& ctx, + const void* alpha, const void* beta, + const cnnlTensorDescriptor_t update_desc, + const void* update, + const cnnlTensorDescriptor_t param_desc, + void* param) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignSub( + handle, alpha, update_desc, update, nullptr, 0, beta, param_desc, param)); +} + +/* static */ void MLUCnnl::Assign(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t update_desc, + const void* update, + const cnnlTensorDescriptor_t param_desc, + void* param) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlCopy(handle, update_desc, update, param_desc, param)); +} + +/* static */ void MLUCnnl::SGD(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, + const cnnlTensorDescriptor_t var_desc, + void* var) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGradientDescent(handle, grad_desc, grad, lr, var_desc, var)); +} + +/* static */ void MLUCnnl::ApplyAdaGrad( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, + const void* grad, const cnnlTensorDescriptor_t accum_desc, void* accum, + const cnnlTensorDescriptor_t var_desc, void* var, const void* lr, + const bool update_slots) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdaGrad(handle, grad_desc, grad, + accum_desc, accum, var_desc, var, + lr, update_slots)); +} + +/* static */ void MLUCnnl::ApplyRMSProp( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, const void* rho, const void* momentum, + const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t ms_desc, void* ms, + const cnnlTensorDescriptor_t mom_desc, void* mom) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlRMSProp(handle, lr, rho, epsilon, momentum, + grad_desc, grad, var_desc, var, + ms_desc, ms, mom_desc, mom)); +} + +/* static */ void MLUCnnl::ApplyCenterRMSProp( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, const void* rho, const void* momentum, + const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t mg_desc, void* mg, + const cnnlTensorDescriptor_t ms_desc, void* ms, + const cnnlTensorDescriptor_t mom_desc, void* mom) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + 
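// The optimizer wrappers in this stretch (SGD, ApplyRMSProp, and the
// ApplyCenterRMSProp call being issued here) delegate the whole update to
// cnnl; this file only forwards pointers. As a readability aid, the plain-CPU
// reference below shows the conventional formulas these ops are assumed to
// implement on device -- the authoritative semantics live in cnnl, not here.
#include <cmath>
#include <cstddef>

// Conventional SGD step: var -= lr * grad, element by element.
inline void SgdUpdateSketch(float* var, const float* grad, float lr,
                            size_t n) {
  for (size_t i = 0; i < n; ++i) var[i] -= lr * grad[i];
}

// Conventional (non-centered) RMSProp step with momentum.
inline void RmsPropUpdateSketch(float* var, float* ms, float* mom,
                                const float* grad, float lr, float rho,
                                float momentum, float epsilon, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    ms[i] = rho * ms[i] + (1.0f - rho) * grad[i] * grad[i];
    mom[i] = momentum * mom[i] + lr * grad[i] / std::sqrt(ms[i] + epsilon);
    var[i] -= mom[i];
  }
}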
PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyCenterRMSProp( + handle, var_desc, var, mg_desc, mg, ms_desc, ms, mom_desc, mom, grad_desc, + grad, lr, rho, momentum, epsilon)); +} + +/* static */ void MLUCnnl::ApplyAdam( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, const void* beta1, const void* beta2, + const void* beta1_power, const void* beta2_power, const void* epsilon, + const bool use_nesterov, const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdam( + handle, grad_desc, var, grad_desc, m, grad_desc, v, grad_desc, grad, lr, + beta1, beta2, beta1_power, beta2_power, epsilon, use_nesterov)); +} + +/* static */ void MLUCnnl::ApplyAdaMax( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v, const void* diff, + const void* lr, const void* beta1, const void* beta2, + const void* beta1_power, const void* epsilon) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlApplyAdaMax(handle, var_desc, var, m_desc, m, v_desc, v, grad_desc, + diff, lr, beta1, beta2, beta1_power, epsilon)); +} + +/* static */ void MLUCnnl::ApplyMomentum(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, + const bool use_nesterov, + const void* lr, const void* momentum, + void* var, void* accum) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlMomentum(handle, grad_desc, var, grad_desc, + accum, grad_desc, grad, lr, momentum, + use_nesterov)); +} + +/* static */ void MLUCnnl::ApplyKerasMomentum( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, + const void* grad, const bool use_nesterov, const void* lr, + const void* momentum, void* var, void* accum) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlKerasMomentum(handle, grad_desc, var, grad_desc, accum, grad_desc, + grad, lr, momentum, use_nesterov)); +} + +/* static */ void MLUCnnl::ApplyAdadelta(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* diff, const void* lr, + const void* rho, const void* epsilon, + void* var, void* accum, + void* accum_update) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlApplyAdadelta(handle, grad_desc, var, grad_desc, accum, grad_desc, + accum_update, grad_desc, diff, lr, rho, epsilon)); +} + +/* static */ void MLUCnnl::Scale( + const ExecutionContext& ctx, const int axis, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t alpha_desc, const void* alpha, + const cnnlTensorDescriptor_t beta_desc, const void* beta, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlScale(handle, axis, input_desc, input, + alpha_desc, alpha, beta_desc, beta, + output_desc, output)); +} + +/* static */ void MLUCnnl::AddN(const ExecutionContext& ctx, uint32_t input_num, + const cnnlTensorDescriptor_t inputs_desc[], + const void* inputs[], + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlAddN(handle, 
inputs_desc, inputs, input_num, output_desc, output)); +} + +/* static */ void MLUCnnl::Log(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + cnnlLogBase_t log_base = CNNL_LOG_E; + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlLog_v2(handle, prefer, log_base, input_desc, + input, output_desc, output)); +} + +/* static */ void MLUCnnl::Matmul( + const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b, + const cnnlTensorDescriptor_t in0_desc, const void* in0, + const cnnlTensorDescriptor_t in1_desc, const void* in1, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + float alpha = 1.0f; + float beta = 0.0f; + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlMatMul(handle, transpose_a, transpose_b, + reinterpret_cast(&alpha), in0_desc, in0, in1_desc, in1, + reinterpret_cast(&beta), output_desc, output)); +} + +/* static */ void MLUCnnl::BatchMatmul( + const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b, + const cnnlTensorDescriptor_t in0_desc, const void* in0, + const cnnlTensorDescriptor_t in1_desc, const void* in1, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBatchMatMulBCastWorkspaceSize( + handle, in0_desc, in1_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCast( + handle, transpose_a, transpose_b, in0_desc, in0, in1_desc, in1, + workspace_ptr, workspace_size, output_desc, output)); +} + +/* static */ void MLUCnnl::OpTensor( + const ExecutionContext& ctx, const cnnlOpTensorDescriptor_t op_tensor_desc, + const cnnlTensorDescriptor_t a_desc, const void* a, + const cnnlTensorDescriptor_t b_desc, const void* b, + const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlDataType_t dtype) { + static const int alpha1_int = 1, alpha2_int = 1, beta_int = 0; + static const float alpha1_float = 1.f, alpha2_float = 1.f, beta_float = 0.f; + + const void* alpha1_ptr = static_cast(&alpha1_float); + const void* alpha2_ptr = static_cast(&alpha2_float); + const void* beta_ptr = static_cast(&beta_float); + + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + + bool is_dt_float = (dtype == CNNL_DTYPE_FLOAT || dtype == CNNL_DTYPE_HALF); + + // if datatype is not float, we set alpha and beta to be int + if (!is_dt_float) { + alpha1_ptr = static_cast(&alpha1_int); + alpha2_ptr = static_cast(&alpha2_int); + beta_ptr = static_cast(&beta_int); + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetOpTensorWorkspaceSize_v2( + handle, op_tensor_desc, alpha1_ptr, a_desc, a, alpha2_ptr, b_desc, b, + beta_ptr, output_desc, output, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlOpTensor( + handle, op_tensor_desc, alpha1_ptr, a_desc, a, alpha2_ptr, b_desc, b, + workspace_ptr, workspace_size, beta_ptr, output_desc, output)); +} + +/* static */ void MLUCnnl::BiasAddGrad( + const 
ExecutionContext& ctx, const int axis, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBiasAddBackward( + handle, out_backprop_desc, out_backprop, axis, output_desc, output)); +} + +/* static */ void MLUCnnl::RandomUniform( + const ExecutionContext& ctx, const int num, const cnnlDataType_t data_type, + const cnnlRandGenerator_t mlu_generator, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandGenerateUniform( + handle, mlu_generator, data_type, nullptr, num, 0, 1, output)); +} + +/* static */ void MLUCnnl::TopK( + const ExecutionContext& ctx, const int k, const int dim, const bool largest, + const bool sorted, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t values_output_desc, + void* values_out, const cnnlTensorDescriptor_t indices_output_desc, + void* indices_out) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor( + handle, input_desc, input, k, dim, largest, sorted, values_output_desc, + values_out, indices_output_desc, indices_out)); +} + +/* static */ void MLUCnnl::StridedSlice( + const ExecutionContext& ctx, const int begin[], const int end[], + const int strides[], const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlStridedSlice( + handle, input_desc, input, begin, end, strides, output_desc, output)); +} + +/* static */ void MLUCnnl::Split(const ExecutionContext& ctx, int split_num, + int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc, + input_ptr, workspace_ptr, workspace_size, + output_descs, output_ptrs)); +} + +/* static */ void MLUCnnl::GatherFunctor( + const ExecutionContext& ctx, const int axis, const int batch_dims, + const cnnlTensorDescriptor_t params_desc, const void* params, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlBatchGatherV2(handle, axis, batch_dims, params_desc, params, + indices_desc, indices, output_desc, output)); +} + +/* static */ void MLUCnnl::ScatterFunctor( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc, + const void* params, const cnnlTensorDescriptor_t updates_desc, + const void* updates, const cnnlTensorDescriptor_t indices_desc, + const void* indices, const cnnlScatterRefMode_t mode) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlScatterRef(handle, params_desc, params, + indices_desc, indices, updates_desc, + updates, 0, mode)); +} + +/* static */ void MLUCnnl::StridedSliceGrad( + const ExecutionContext& ctx, const int begin[], const int end[], + 
const int strides[], const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlStridedSliceBackward( + handle, begin, end, strides, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Logic( + const ExecutionContext& ctx, const MLULogicMethod log_method, + const cnnlTensorDescriptor_t input1_desc, const void* input1, + const cnnlTensorDescriptor_t input2_desc, const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetLogicOpWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlLogicOp( + handle, cnnlLogicOp_t(log_method), input1_desc, input1, input2_desc, + input2, workspace_ptr, workspace_size, output_desc, output)); +} + +/* static */ void MLUCnnl::Select( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t then_desc, + const void* p_then, const cnnlTensorDescriptor_t else_desc, + const void* p_else, const cnnlTensorDescriptor_t output_desc, void* output, + const bool* condition, const int condition_size) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSelect(handle, then_desc, p_then, else_desc, + p_else, output_desc, output, condition, + condition_size)); +} + +/*static */ void MLUCnnl::GatherNd(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t params_desc, + const void* params, + const cnnlTensorDescriptor_t indices_desc, + const void* indices, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGatherNd( + handle, params_desc, params, indices_desc, indices, output_desc, output)); +} + +/* static */ void MLUCnnl::BatchToSpace( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlSpaceBatchParam_t param) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBatch2spaceWorkspaceSize( + handle, input_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatch2space(handle, input_desc, input, + output_desc, output, param, + workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::BatchToSpaceNd( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, + const void* input, cnnlSpaceBatchNdDescriptor_t param, + void* extra_device_input, size_t extra_input_size, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlBatch2spaceNd_v2(handle, input_desc, input, output_desc, output, + param, extra_device_input, extra_input_size)); +} + +/* static */ void MLUCnnl::SoftmaxForward( + const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm, + cnnlSoftmaxMode_t mode, const void* alpha, + const cnnlTensorDescriptor_t 
input_desc, const void* input, + const void* beta, const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSoftmaxForward(handle, algorithm, mode, alpha, + input_desc, input, beta, + output_desc, output)); +} + +/* static */ void MLUCnnl::Softplus(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t features_desc, + const void* features, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + const int beta = 1; + const int threshold = 20; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSoftplusForward( + handle, features_desc, features, output_desc, output, beta, threshold)); +} + +/* static */ void MLUCnnl::SoftplusGrad( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t gradients_desc, + const void* gradients, const cnnlTensorDescriptor_t features_desc, + const void* features, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + int beta = 1; + int threshold = 20; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSoftplusBackward(handle, features_desc, features, gradients_desc, + gradients, output_desc, output, beta, threshold)); +} + +/* static */ void MLUCnnl::PoolingForward( + const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const std::vector& output_shape, + const cnnlPoolingDescriptor_t pooling_desc, const void* alpha, + const cnnlTensorDescriptor_t input_desc, const void* input, + const void* beta, const void* extra_input_ptr, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPoolingWorkspaceSize( + handle, pool_mode, output_shape[2], output_shape[1], &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingForward_v2( + handle, pooling_desc, alpha, input_desc, input, beta, extra_input_ptr, + output_desc, output, workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::Pool3D( + const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const std::vector& output_shape, + const cnnlPoolingDescriptor_t pooling_desc, const void* alpha, + const cnnlTensorDescriptor_t input_desc, const void* input, + const void* beta, const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPoolingWorkspaceSize( + handle, pool_mode, output_shape[2], output_shape[1], &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlPoolingForward(handle, pooling_desc, alpha, input_desc, input, beta, + output_desc, output, workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::RsqrtGrad(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t data_desc, + const void* y, const void* diff_y, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRsqrtBackward(handle, data_desc, y, diff_y, output)); +} + +/* static */ void MLUCnnl::SqrtGrad(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t data_desc, + const void* y, const 
void* diff_y, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSqrtBackward(handle, data_desc, y, diff_y, output)); +} + +/* static */ void MLUCnnl::UnsortedSegmentSum( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc, + const void* data, const cnnlTensorDescriptor_t ids_desc, + const int* segment_ids, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetUnsortedSegmentSumWorkspaceSize( + handle, data_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlUnsortedSegmentSum( + handle, data_desc, data, ids_desc, segment_ids, workspace_ptr, + workspace_size, output_desc, output)); +} + +/* static */ void MLUCnnl::Pad(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, const void* paddings, + const void* padding_value, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlPad(handle, input_desc, input, paddings, + padding_value, output_desc, output)); +} + +/* static */ void MLUCnnl::OneHot(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t desc_indices, + const void* indices, const int depth, + const void* on_value, const void* off_value, + const int axis, + cnnlDataType_t output_data_type, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlOneHot(handle, desc_indices, indices, depth, + on_value, off_value, axis, + output_data_type, output)); +} + +/* static */ void MLUCnnl::ConvolutionForward( + const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc, + const void* alpha, const void* beta, const cnnlTensorDescriptor_t bias_desc, + const void* bias_ptr, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t filtet_desc, + const void* filter, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + // cnnl: select best algorithm for convolution compution. 
+ cnnlConvolutionForwardAlgo_t algo; + cnnlConvolutionFwdPreference_t preference = CNNL_CONVOLUTION_FWD_FASTEST; + cnnlGetConvolutionForwardAlgorithm(handle, conv_desc, input_desc, filtet_desc, + output_desc, preference, &algo); + + // get workspace size + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardWorkspaceSize( + handle, input_desc, filtet_desc, output_desc, bias_desc, conv_desc, algo, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConvolutionForward( + handle, conv_desc, algo, alpha, input_desc, input, filtet_desc, filter, + bias_desc, bias_ptr, workspace_ptr, workspace_size, beta, output_desc, + output)); +} + +/* static */ void MLUCnnl::Tile(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlTile(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::SoftmaxCrossEntropyWithLogits( + const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, + cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, + const void* logits_in, const cnnlTensorDescriptor_t label_desc, + const void* labels_in, const cnnlTensorDescriptor_t loss_out_desc, + void* loss_out, const cnnlTensorDescriptor_t back_out_desc, + void* back_out) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSoftmaxCrossEntropyWithLogits_v2( + handle, mode, prefer, input_desc, logits_in, label_desc, labels_in, + loss_out_desc, loss_out, back_out_desc, back_out)); +} + +/* static */ void MLUCnnl::Reduce( + const ExecutionContext& ctx, const bool need_workspace, + const cnnlReduceDescriptor_t reduction_desc, const void* alpha, + const cnnlTensorDescriptor_t input_desc, const void* input, + const size_t indices_size, void* indices, const void* beta, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size = 0; + void* workspace_ptr = nullptr; + Tensor workspace; + if (need_workspace) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetReduceOpWorkspaceSize( + handle, input_desc, output_desc, reduction_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + + workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlReduce( + handle, reduction_desc, workspace_ptr, workspace_size, alpha, input_desc, + input, indices_size, indices, beta, output_desc, output)); +} + +/* static */ void MLUCnnl::FloorDiv( + const ExecutionContext& ctx, cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, const void* input1, + const cnnlTensorDescriptor_t input2_desc, const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetFloorDivWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + 
PADDLE_ENFORCE_MLU_SUCCESS( + cnnlFloorDiv_v2(handle, prefer, input1_desc, input1, input2_desc, input2, + output_desc, output, workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::FloorMod(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetFloorModWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlFloorMod(handle, input1_desc, input1, input2_desc, input2, + output_desc, output, workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::Maximum(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetMaximumWorkspaceSize(handle, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlMaximum(handle, input1_desc, input1, input2_desc, input2, output_desc, + output, workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::Minimum(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetMinimumWorkspaceSize(handle, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlMinimum(handle, input1_desc, input1, input2_desc, input2, output_desc, + output, workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::PowR( + const ExecutionContext& ctx, cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, const void* input1, + const cnnlTensorDescriptor_t input2_desc, const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPowRWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlPowR_v2(handle, prefer, input1_desc, input1, + input2_desc, input2, workspace_ptr, + workspace_size, output_desc, output)); +} + +/* static */ void MLUCnnl::DivNoNan( + const ExecutionContext& ctx, cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, const void* input1, + 
const cnnlTensorDescriptor_t input2_desc, const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetDivNoNanWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlDivNoNan_v2(handle, prefer, input1_desc, input1, input2_desc, input2, + workspace_ptr, workspace_size, output_desc, output)); +} + +/* static */ void MLUCnnl::SquaredDifference( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc, + const void* input1, const cnnlTensorDescriptor_t input2_desc, + const void* input2, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSquaredDifferenceWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSquaredDifference( + handle, input1_desc, input1, input2_desc, input2, output_desc, output, + workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::L2Loss(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlL2Loss(handle, input_desc, input, output)); +} + +/* static */ void MLUCnnl::Abs(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlAbs(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Neg(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlNegTensor(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Floor(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlFloor(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Ceil(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlCeil(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::IsNan(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlIsNan(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Square(const ExecutionContext& ctx, + const 
cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSquare(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Sqrt(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSqrt_v2(handle, prefer, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Rsqrt(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlRsqrt_v2(handle, prefer, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Cos(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlCos_v2(handle, prefer, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Sin(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSin_v2(handle, prefer, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::TrigonForward( + const ExecutionContext& ctx, const cnnlTrigonDescriptor_t trigon_desc, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlTrigonForward(handle, trigon_desc, input_desc, + input, output_desc, output)); +} + +/* static */ void MLUCnnl::Exp(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlExp_v2(handle, prefer, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Sign(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSign(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::IsFinite(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlIsFinite(handle, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::IsNanInf(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + // TODO(CTR-3849): output type should be void*, but now bool*. 
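+  // cnnlNanInf reports whether the input contains any NaN/Inf through a bool
+  // flag, so the generic void* output is reinterpreted as bool* for now; once
+  // the TODO above is resolved this cast can go away.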
+ PADDLE_ENFORCE_MLU_SUCCESS( + cnnlNanInf(handle, input_desc, input, reinterpret_cast(output))); +} + +/* static */ void MLUCnnl::Erf(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlErf_v2(handle, prefer, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Log1p(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlLog1p(handle, prefer, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::LogicalNot(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlLogicOp(handle, CNNL_LOGIC_OP_NOT, input_desc, + input, input_desc, input, nullptr, 0, + output_desc, output)); +} + +/* static */ void MLUCnnl::DynamicStitch( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t* indices_desc, + const int** indices, const cnnlTensorDescriptor_t* data_desc, + const void** data, const int size, int* indices_dims, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetDynamicStitchWorkspaceSize(handle, size, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDynamicStitch( + handle, indices_desc, indices, data_desc, data, size, indices_dims, + workspace_ptr, workspace_size, output_desc, output)); +} + +/* static */ void MLUCnnl::CropAndResize( + const ExecutionContext& ctx, const std::string method_name, + const float extrapolation_value, const cnnlTensorDescriptor_t image_desc, + const void* image, const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, const cnnlTensorDescriptor_t box_index_desc, + const void* box_index, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + cnnlCropAndResizeMode_t mode = CNNL_CROP_AND_RESIZE_BILINEAR; + if (method_name == "nearest") { + mode = CNNL_CROP_AND_RESIZE_NEAREST; + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCropAndResize( + handle, image_desc, image, boxes_desc, boxes, box_index_desc, box_index, + mode, extrapolation_value, output_desc, output)); +} + +/* static */ void MLUCnnl::CropAndResizeBackwardImage( + const ExecutionContext& ctx, const std::string method_name, + const cnnlTensorDescriptor_t grads_desc, const void* grads, + const cnnlTensorDescriptor_t boxes_desc, const void* boxes, + const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx, + const cnnlTensorDescriptor_t grads_image_desc, void* grads_image) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + cnnlCropAndResizeMode_t mode = CNNL_CROP_AND_RESIZE_BILINEAR; + if (method_name == "nearest") { + mode = CNNL_CROP_AND_RESIZE_NEAREST; + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCropAndResizeBackwardImage( + handle, grads_desc, grads, boxes_desc, boxes, 
box_idx_desc, box_idx, mode, + grads_image_desc, grads_image)); +} + +/* static */ void MLUCnnl::CropAndResizeBackwardBoxes( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t image_desc, + const void* image, const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, const cnnlTensorDescriptor_t box_idx_desc, + const void* box_idx, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + cnnlCropAndResizeMode_t mode = CNNL_CROP_AND_RESIZE_BILINEAR; + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCropAndResizeBackwardBoxes( + handle, input_desc, input, image_desc, image, boxes_desc, boxes, + box_idx_desc, box_idx, output_desc, output, mode)); +} + +/* static */ void MLUCnnl::Interp( + const ExecutionContext& ctx, const cnnlInterpMode_t mode, + const bool align_corners, const bool half_pixel_centers, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlInterp_v2(handle, align_corners, half_pixel_centers, mode, NULL, true, + input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::InterpBackward( + const ExecutionContext& ctx, const cnnlInterpBackwardMode_t mode, + const bool align_corners, const bool half_pixel_centers, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlInterpBackward(handle, align_corners, half_pixel_centers, mode, + input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Cast(const ExecutionContext& ctx, + cnnlCastDataType_t cast_type, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCastDataType(handle, input_desc, input, + cast_type, output_desc, output)); +} + +/* static */ void MLUCnnl::PoolingBackward( + const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, + const void* alpha, const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, + const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingBackward( + handle, const_cast(pooling_desc), alpha, y_desc, + y, diff_y_desc, diff_y, x_desc, x, beta, diff_x_desc, diff_x)); +} + +/* static */ void MLUCnnl::NonMaxSuppression( + const ExecutionContext& ctx, const cnnlNmsDescriptor_t nms_desc, + const cnnlTensorDescriptor_t boxes_desc, const void* boxes, + const cnnlTensorDescriptor_t confidence_desc, const void* confidence, + const cnnlTensorDescriptor_t output_desc, void* output, void* output_size) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetNmsWorkspaceSize_v2(handle, confidence_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlNms_v2( + handle, nms_desc, boxes_desc, boxes, confidence_desc, confidence, + 
workspace_ptr, workspace_size, output_desc, output, output_size)); +} + +/* static */ void MLUCnnl::PoolingIndex( + const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t y_desc, void* y) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingIndex( + handle, const_cast(pooling_desc), x_desc, x, + y_desc, y)); +} + +/* static */ void MLUCnnl::SpaceToBatch( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output, + const int64_t block_shape[]) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSpace2batchWorkspaceSize( + handle, input_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + cnnlSpaceBatchParam_t param = {static_cast(block_shape[0]), + static_cast(block_shape[1])}; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSpace2batch(handle, input_desc, input, + output_desc, output, param, + workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::SpaceToBatchNd( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, + const void* input, cnnlSpaceBatchNdDescriptor_t param, + void* extra_device_input, size_t extra_host_input, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSpace2batchNd_v2(handle, input_desc, input, output_desc, output, + param, extra_device_input, extra_host_input)); +} + +/* static */ void MLUCnnl::FusedBatchNorm( + const ExecutionContext& ctx, const bool is_training, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t scale_desc, const void* scale, + const void* offset, const void* running_mean_input, + const void* running_variance_input, float epsilon, float momentum, + const cnnlTensorDescriptor_t output_desc, void* output, + void* running_mean_output, void* running_var_output, + void* saved_batch_mean_output, void* saved_batch_var_output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + if (is_training) { + /* + * If in Paddle, running_mean_output = momentum * runnning_mean_input + + * (1 - momentum) * batch_mean. However, In CNNL, + * running_mean_output = (1 - momentum) * running_mean_input + + * momentum * batch_mean. So we pass (1.0 - momentum) to momentum param. 
+ */ + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchNormForwardTraining( + handle, NULL, NULL, x_desc, x, scale_desc, scale, offset, + running_mean_output, running_var_output, epsilon, 1.0 - momentum, + output_desc, output, saved_batch_mean_output, saved_batch_var_output)); + } else { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchNormForwardInference( + handle, NULL, NULL, x_desc, x, scale_desc, scale, offset, + running_mean_input, running_variance_input, epsilon, output_desc, + output)); + } +} + +/* static */ void MLUCnnl::FusedBatchNormGrad( + const ExecutionContext& ctx, const bool is_training, + const cnnlTensorDescriptor_t y_backprop_desc, const void* y_backprop, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t scale_desc, const void* scale, + const void* saved_mean, const void* saved_var, float epsilon, + const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop, + void* scale_backprop, void* offset_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + if (is_training) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchNormBackward( + handle, NULL, NULL, NULL, NULL, x_desc, x, y_backprop_desc, y_backprop, + scale_desc, scale, saved_mean, saved_var, epsilon, x_backprop_desc, + x_backprop, scale_backprop, offset_backprop)); + } else { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlFrozenBatchNormBackward( + handle, x_desc, x, y_backprop_desc, y_backprop, scale_desc, scale, + saved_mean, saved_var, epsilon, x_backprop_desc, x_backprop, + scale_backprop, offset_backprop)); + } +} + +/* static */ void MLUCnnl::QuantizeParam( + const ExecutionContext& ctx, const cnnlQuantizeMode_t mode, + const int bitwidth, const cnnlTensorDescriptor_t input_desc, + const void* input, void* position, void* scale, void* offset) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetQuantizeParamWorkspaceSize(handle, input_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeParam( + handle, mode, input_desc, input, bitwidth, workspace_ptr, workspace_size, + position, scale, offset)); +} + +/* static */ void MLUCnnl::Conv2D( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, + const void* input_position, const void* input_scale, + const void* input_offset, const void* filter_position, + const void* filter_scale, const void* filter_offset, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t filter_desc, const void* filter, + const cnnlTensorDescriptor_t bias_desc, const void* bias, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(input_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(filter_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(output_desc, tensor_dtype)); + + cnnlConvolutionForwardAlgo_t algo; + const cnnlConvolutionFwdPreference_t preference = + CNNL_CONVOLUTION_FWD_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardAlgorithm( + handle, conv_desc, input_desc, filter_desc, output_desc, preference, + &algo)); + + size_t workspace_size = 0; + 
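+  // With the on-chip dtypes set above, this is the quantized convolution
+  // path: the fastest forward algorithm was selected just before this point,
+  // the workspace is sized for it below, and cnnlQuantizeConvolutionForward
+  // then consumes the per-tensor (position, scale, offset) quantization
+  // parameters, typically the ones produced by MLUCnnl::QuantizeParam above.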
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardWorkspaceSize( + handle, input_desc, filter_desc, output_desc, bias_desc, conv_desc, algo, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeConvolutionForward( + handle, conv_desc, algo, nullptr /*alpha*/, input_desc, input, + input_position, input_scale, input_offset, filter_desc, filter, + filter_position, filter_scale, filter_offset, bias_desc, bias, + workspace_ptr, workspace_size, nullptr /*beta*/, output_desc, output)); +} + +/* static */ void MLUCnnl::FusedConvBNQuantify( + const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc, + const void* epsilon_ptr, const int fused_ops_number, + const cnnlDataType_t tensor_dtype, const int input_position, + const float input_scale, const int filter_position, + const float filter_scale, const cnnlTensorDescriptor_t scale_desc, + const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc, + const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc, + const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc, + const void* variance_ptr, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t filter_desc, + const void* filter, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(input_desc, CNNL_DTYPE_INT16)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(filter_desc, CNNL_DTYPE_INT16)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(output_desc, tensor_dtype)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTensorDescriptorPositionAndScale( + input_desc, input_position, input_scale)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTensorDescriptorPositionAndScale( + filter_desc, filter_position, filter_scale)); + + cnnlFusedOpsPlan_t fusion_plan = nullptr; + cnnlActivationDescriptor_t active_desc = nullptr; + cnnlFusedOpsConstParamPack_t cparam_pack = nullptr; + cnnlFusedOpsVariantParamPack_t vparam_pack = nullptr; + cnnlConvolutionForwardAlgo_t algo; + cnnlFusedOps_t fusion_type = CNNL_CONV_SCALE_BN_ACTIVATION; + cnnlConvolutionCastMode_t cast_mode = CNNL_OFFLINE_SYMMETRIC_QUANTIZE; + cnnlConvolutionFwdPreference_t preference = CNNL_CONVOLUTION_FWD_FASTEST; + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardAlgorithm( + handle, conv_desc, input_desc, filter_desc, output_desc, preference, + &algo)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateFusedOpsPlan(&fusion_plan, fusion_type)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlCreateFusedOpsConstParamPack(&cparam_pack, fusion_type)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlCreateFusedOpsVariantParamPack(&vparam_pack, fusion_type)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_XDESC, input_desc)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetFusedOpsVariantParamPackAttribute(vparam_pack, CNNL_PTR_X, input)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_WDESC, filter_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_W, filter)); + + if (fused_ops_number > 1) { + cnnlCreateActivationDescriptor(&active_desc); + cnnlNanPropagation_t nan_opt = CNNL_NOT_PROPAGATE_NAN; + 
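+    // With fused_ops_number > 1 a ReLU activation descriptor is attached to
+    // the const param pack, so the fused plan runs conv + scale/BN + ReLU;
+    // otherwise only the conv + scale/BN part of the plan is executed.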
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor( + active_desc, CNNL_ACTIVATION_RELU, nan_opt, 0.0)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_ACTIVATION_DESC, active_desc)); + } + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, scale_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_BN_WEIGHT, scale_ptr)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, offset_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_BN_BIAS, offset_ptr)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, mean_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_BN_MEAN, mean_ptr)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, variance_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_BN_VAR, variance_ptr)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_CONV_DESC, conv_desc)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_SCALAR_CONV_FWD_ALGO, &algo)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_SCALAR_CONV_FWD_CAST_MODE, &cast_mode)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_SCALAR_BN_EPSILON, epsilon_ptr)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_YDESC, output_desc)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_Y, output)); + + // get workspace size + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlMakeFusedOpsPlan(handle, fusion_plan, cparam_pack, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + if (workspace_size > 0) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_WORKSPACE, workspace_ptr)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_SCALAR_WORKSPACE_SIZE, &workspace_size)); + } + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlFusedOpsExecute(handle, fusion_plan, vparam_pack)); + + if (active_desc) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyActivationDescriptor(active_desc)); + } + + if (cparam_pack) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyFusedOpsConstParamPack(cparam_pack)); + } + + if (vparam_pack) { + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlDestroyFusedOpsVariantParamPack(vparam_pack)); + } + + if (fusion_plan) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyFusedOpsPlan(fusion_plan)); + } +} + +/* static */ void MLUCnnl::ConvBackpropInput( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlTensorDescriptor_t filter_desc, const void* filter, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + cnnlConvolutionBwdDataAlgo_t algo; + const 
cnnlConvolutionBwdDataPreference_t preference = + CNNL_CONVOLUTION_BWD_DATA_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataAlgorithm( + handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc, + preference, &algo)); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataWorkspaceSize( + handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc, algo, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConvolutionBackwardData( + handle, nullptr /*alpha*/, filter_desc, filter, out_backprop_desc, + out_backprop, conv_desc, algo, workspace_ptr, workspace_size, + nullptr /*beta*/, in_backprop_desc, in_backprop)); +} + +/* static */ void MLUCnnl::QuantizeConvBackpropInput( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, + const void* filter_position, const void* filter_scale, + const void* filter_offset, const void* out_backprop_position, + const void* out_backprop_scale, const void* out_backprop_offset, + const cnnlTensorDescriptor_t filter_desc, const void* filter, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(filter_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(out_backprop_desc, dt_onchip)); + + cnnlConvolutionBwdDataAlgo_t algo; + const cnnlConvolutionBwdDataPreference_t preference = + CNNL_CONVOLUTION_BWD_DATA_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataAlgorithm( + handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc, + preference, &algo)); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataWorkspaceSize( + handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc, algo, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeConvolutionBackwardData( + handle, nullptr /*alpha*/, filter_desc, filter, filter_position, + filter_scale, filter_offset, out_backprop_desc, out_backprop, + out_backprop_position, out_backprop_scale, out_backprop_offset, conv_desc, + algo, workspace_ptr, workspace_size, nullptr /*beta*/, in_backprop_desc, + in_backprop)); +} + +/* static */ void MLUCnnl::ConvBackpropFilter( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + cnnlConvolutionBwdFilterAlgo_t algo; + const cnnlConvolutionBwdFilterPreference_t preference = + CNNL_CONVOLUTION_BWD_FILTER_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterAlgorithm( + handle, conv_desc, input_desc, out_backprop_desc, filter_backprop_desc, + preference, &algo)); + + size_t workspace_size = 0; + 
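+  // Same three-step pattern as ConvBackpropInput above: pick the fastest
+  // algorithm for this descriptor combination, size a workspace for it, and
+  // launch the backward-filter kernel with nullptr alpha/beta, matching the
+  // other convolution wrappers in this file.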
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterWorkspaceSize( + handle, input_desc, out_backprop_desc, filter_backprop_desc, conv_desc, + algo, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConvolutionBackwardFilter( + handle, nullptr /*alpha*/, input_desc, input, out_backprop_desc, + out_backprop, conv_desc, algo, workspace_ptr, workspace_size, + nullptr /*beta*/, filter_backprop_desc, filter_backprop)); +} + +/* static */ void MLUCnnl::QuantizeConvBackpropFilter( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, + const void* input_position, const void* input_scale, + const void* input_offset, const void* out_backprop_position, + const void* out_backprop_scale, const void* out_backprop_offset, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(input_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(out_backprop_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTensorDescriptorOnchipDataType( + filter_backprop_desc, tensor_dtype)); + + cnnlConvolutionBwdFilterAlgo_t algo; + const cnnlConvolutionBwdFilterPreference_t preference = + CNNL_CONVOLUTION_BWD_FILTER_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterAlgorithm( + handle, conv_desc, input_desc, out_backprop_desc, filter_backprop_desc, + preference, &algo)); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterWorkspaceSize( + handle, input_desc, out_backprop_desc, filter_backprop_desc, conv_desc, + algo, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeConvolutionBackwardFilter( + handle, nullptr /*alpha*/, input_desc, input, input_position, input_scale, + input_offset, out_backprop_desc, out_backprop, out_backprop_position, + out_backprop_scale, out_backprop_offset, conv_desc, algo, workspace_ptr, + workspace_size, nullptr /*beta*/, filter_backprop_desc, filter_backprop)); +} + +/* static */ void MLUCnnl::QuantizeMatMul( + const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b, + const cnnlTensorDescriptor_t a_desc, const void* a, const void* a_position, + const void* a_scale, const void* a_offset, + const cnnlTensorDescriptor_t b_desc, const void* b, const void* b_position, + const void* b_scale, const void* b_offset, const cnnlDataType_t quant_type, + const cnnlDataType_t data_type, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + // Set onchip data type + cnnlSetTensorDescriptorOnchipDataType(a_desc, quant_type); + cnnlSetTensorDescriptorOnchipDataType(b_desc, quant_type); + cnnlSetTensorDescriptorOnchipDataType(output_desc, data_type); + + // Create and set matmul descriptor + cnnlMatMulDescriptor_t matmul_desc; + 
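+  // Descriptor lifecycle for the quantized matmul: create the descriptor,
+  // set the compute dtype and the transa/transb flags, create an algorithm
+  // handle and ask CNNL for the fastest choice, size the workspace, run, and
+  // destroy both the descriptor and the algorithm at the end of this
+  // function. Callers only pass tensor descriptors and quantization params.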
PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulDescCreate(&matmul_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetMatMulDescAttr( + matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, &data_type, sizeof(int))); + int transpose_a_int = static_cast(transpose_a); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetMatMulDescAttr( + matmul_desc, CNNL_MATMUL_DESC_TRANSA, &(transpose_a_int), sizeof(int))); + int transpose_b_int = static_cast(transpose_b); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetMatMulDescAttr( + matmul_desc, CNNL_MATMUL_DESC_TRANSB, &(transpose_b_int), sizeof(int))); + + // Create and get matmul algorithim + cnnlMatMulAlgo_t algo; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulAlgoCreate(&algo)); + const cnnlMatMulPreference_t preference = CNNL_MATMUL_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeMatMulAlgorithm( + handle, matmul_desc, a_desc, b_desc, output_desc, preference, &algo)); + + // Get workspace + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeMatMulWorkspaceSize( + handle, matmul_desc, a_desc, b_desc, output_desc, algo, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + // Compute + float alpha = 1.0; + float beta = 0.0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeMatMul( + handle, matmul_desc, reinterpret_cast(&alpha), a_desc, a, + a_position, a_scale, a_offset, b_desc, b, b_position, b_scale, b_offset, + reinterpret_cast(&beta), output_desc, output, algo, workspace_ptr, + workspace_size)); + + // Destroy matmul descriptor and algorithim + PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulDescDestroy(matmul_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulAlgoDestroy(algo)); +} + +/* static */ void MLUCnnl::QuantizeBatchMatMul( + const ExecutionContext& ctx, const bool adj_x, const bool adj_y, + const cnnlTensorDescriptor_t in0_desc, const void* in0, + const void* in0_position, const void* in0_scale, const void* in0_offset, + const cnnlTensorDescriptor_t in1_desc, const void* in1, + const void* in1_position, const void* in1_scale, const void* in1_offset, + const cnnlDataType_t quant_type, const cnnlDataType_t data_type, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + // Set onchip data type + cnnlSetTensorDescriptorOnchipDataType(in0_desc, quant_type); + cnnlSetTensorDescriptorOnchipDataType(in1_desc, quant_type); + cnnlSetTensorDescriptorOnchipDataType(output_desc, data_type); + + // Create and set batch matmul descriptor + cnnlBatchMatMulDescriptor_t bmm_desc; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulDescCreate(&bmm_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulDescAttr( + bmm_desc, CNNL_BMM_DESC_COMPUTE_TYPE, &data_type, sizeof(int))); + int transpose_a_int = static_cast(adj_x); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulDescAttr( + bmm_desc, CNNL_BMM_DESC_TRANSA, &(transpose_a_int), sizeof(int))); + int transpose_b_int = static_cast(adj_y); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulDescAttr( + bmm_desc, CNNL_BMM_DESC_TRANSB, &(transpose_b_int), sizeof(int))); + + // Create and get batch matmul algorithim + cnnlBatchMatMulAlgo_t algo; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulAlgoCreate(&algo)); + const cnnlBatchMatMulPreference_t preference = CNNL_BMM_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulAlgorithm( + handle, bmm_desc, in0_desc, in1_desc, output_desc, preference, &algo)); + + // Get workspace + size_t workspace_size; + 
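+  // This wrapper and QuantizeBatchMatMulBCast below mirror the QuantizeMatMul
+  // flow above; the BCast variant additionally broadcasts the leading batch
+  // dimensions of the two operands before the per-batch matmul.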
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulWorkspaceSize( + handle, bmm_desc, in0_desc, in1_desc, output_desc, algo, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + // Compute + float alpha = 1.0; + float beta = 0.0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeBatchMatMul( + handle, bmm_desc, reinterpret_cast(&alpha), in0_desc, in0, + in0_position, in0_scale, in0_offset, in1_desc, in1, in1_position, + in1_scale, in1_offset, reinterpret_cast(&beta), output_desc, + output, algo, workspace_ptr, workspace_size)); + + // Destroy matmul descriptor and algorithim + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulDescDestroy(bmm_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulAlgoDestroy(algo)); +} + +/* static */ void MLUCnnl::QuantizeBatchMatMulBCast( + const ExecutionContext& ctx, const bool adj_x, const bool adj_y, + const cnnlTensorDescriptor_t in0_desc, const void* in0, + const void* in0_position, const void* in0_scale, const void* in0_offset, + const cnnlTensorDescriptor_t in1_desc, const void* in1, + const void* in1_position, const void* in1_scale, const void* in1_offset, + const cnnlDataType_t quant_type, const cnnlDataType_t data_type, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + // Set onchip data type + cnnlSetTensorDescriptorOnchipDataType(in0_desc, quant_type); + cnnlSetTensorDescriptorOnchipDataType(in1_desc, quant_type); + cnnlSetTensorDescriptorOnchipDataType(output_desc, data_type); + + // Create and set batch matmul descriptor + cnnlBatchMatMulBCastDescriptor_t bmm_bcast_desc; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastDescCreate(&bmm_bcast_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulBCastDescAttr( + bmm_bcast_desc, CNNL_BMM_BCAST_DESC_COMPUTE_TYPE, &data_type, + sizeof(int))); + int transpose_a_int = static_cast(adj_x); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulBCastDescAttr( + bmm_bcast_desc, CNNL_BMM_BCAST_DESC_TRANSA, &(transpose_a_int), + sizeof(int))); + int transpose_b_int = static_cast(adj_y); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulBCastDescAttr( + bmm_bcast_desc, CNNL_BMM_BCAST_DESC_TRANSB, &(transpose_b_int), + sizeof(int))); + + // Create and get batch matmul algorithim + cnnlBatchMatMulBCastAlgo_t algo; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastAlgoCreate(&algo)); + const cnnlBatchMatMulBCastPreference_t preference = CNNL_BMM_BCAST_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulBCastAlgorithm( + handle, bmm_bcast_desc, in0_desc, in1_desc, output_desc, preference, + &algo)); + + // Get workspace + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulBCastWorkspaceSize( + handle, bmm_bcast_desc, in0_desc, in1_desc, output_desc, algo, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + // Compute + float alpha = 1.0; + float beta = 0.0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeBatchMatMulBCast( + handle, bmm_bcast_desc, reinterpret_cast(&alpha), in0_desc, in0, + in0_position, in0_scale, in0_offset, in1_desc, in1, in1_position, + in1_scale, in1_offset, reinterpret_cast(&beta), output_desc, + output, algo, workspace_ptr, workspace_size)); + + // Destroy matmul descriptor and algorithim + 
PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastDescDestroy(bmm_bcast_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastAlgoDestroy(algo)); +} + +/* static */ void MLUCnnl::Transpose( + const ExecutionContext& ctx, const std::vector perm, + const int input_dim, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + cnnlTransposeDescriptor_t perm_desc; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateTransposeDescriptor(&perm_desc)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTransposeDescriptor(perm_desc, input_dim, perm.data())); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetTransposeWorkspaceSize( + handle, input_desc, perm_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlTranspose_v2(handle, perm_desc, input_desc, + input, output_desc, output, + workspace_ptr, workspace_size)); + if (perm_desc) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyTransposeDescriptor(perm_desc)); + } +} + +/* static */ void MLUCnnl::MatrixBandPart( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc, + const void* input, const int num_lower, const int num_upper, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatrixBandPart(handle, data_desc, input, + num_lower, num_upper, output)); +} + +/* static */ void MLUCnnl::NumTrue(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, Tensor index, + uint32_t* num_true) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetNumTrueWorkspaceSize(handle, x_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + index = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* index_ptr = index.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlNumTrue( + handle, x_desc, x, static_cast(index_ptr), num_true)); +} + +/* static */ void MLUCnnl::Where(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, const uint32_t* strides, + const uint32_t* index, + const cnnlTensorDescriptor_t y_desc, int* y, + const bool as_tuple) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlWhere(handle, x_desc, x, strides, index, y_desc, y, as_tuple)); +} + +/* static */ void MLUCnnl::InTopK( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t predictions_desc, + const void* predictions, const cnnlTensorDescriptor_t targets_desc, + const void* targets, const cnnlTensorDescriptor_t k_desc, const void* k, + const int k_int, const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlInTopK(handle, predictions_desc, predictions, + targets_desc, targets, k_desc, k, k_int, + output_desc, output)); +} + +/* static */ void MLUCnnl::ScatterNd(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t indices_desc, + const void* indices, + const cnnlTensorDescriptor_t updates_desc, + const void* updates, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlScatterNd(handle, indices_desc, indices, + updates_desc, updates, output_desc, + 
output)); +} + +/* static */ void MLUCnnl::BitWise( + const ExecutionContext& ctx, const cnnlBitComputeOp_t optype, + const cnnlTensorDescriptor_t input1_desc, const void* input1, + const cnnlTensorDescriptor_t input2_desc, const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBitComputeWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBitCompute_v2( + handle, optype, input1_desc, input1, input2_desc, input2, output_desc, + output, workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::QR(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t a_desc, + const void* a, + const cnnlTensorDescriptor_t q_desc, void* q, + const cnnlTensorDescriptor_t r_desc, void* r, + const bool some) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetQRWorkspaceSize(handle, a_desc, some, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQR(handle, a_desc, a, q_desc, q, r_desc, r, + workspace_ptr, workspace_size, some)); +} + +/* static */ void MLUCnnl::Reciprocal(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlReciprocal(handle, input_desc, input, output_desc, output)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index e0a2735e0ea4d..ab398a92c2972 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -30,7 +30,20 @@ namespace operators { using Tensor = framework::Tensor; using DataLayout = framework::DataLayout; +using ExecutionContext = framework::ExecutionContext; using DeviceContextPool = platform::DeviceContextPool; +using MLUDeviceContext = platform::MLUDeviceContext; + +enum MLULogicMethod { + CNNL_LOGIC_OP_EQ = 0, + CNNL_LOGIC_OP_NE = 1, + CNNL_LOGIC_OP_GT = 2, + CNNL_LOGIC_OP_GE = 3, + CNNL_LOGIC_OP_LT = 4, + CNNL_LOGIC_OP_LE = 5, + CNNL_LOGIC_OP_AND = 6, + CNNL_LOGIC_OP_OR = 7, +}; template inline cnnlDataType_t ToCnnlDataType(const T& t) { @@ -76,6 +89,14 @@ NarrowT CheckedNarrowing(const WideT& wide) { return narrow; } +static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) { + return ctx.template device_context().cnnl_handle(); +} + +static const MLUDeviceContext& GetDevCtxFromCTX(const ExecutionContext& ctx) { + return ctx.template device_context(); +} + cnnlDeviceType_t GetCnnlDev(int dev_ordinal); using CnnlTensorDesc = cnnlTensorDescriptor_t; @@ -146,22 +167,914 @@ class MLUCnnlActivationDesc { cnnlActivationDescriptor_t active_desc_ = nullptr; }; +class MLUCnnlPoolingDesc { + public: + MLUCnnlPoolingDesc(const MLUCnnlPoolingDesc& desc) = delete; + MLUCnnlPoolingDesc& operator=(const MLUCnnlPoolingDesc& desc) = delete; + + MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode, + const 
cnnlNanPropagation_t maxpooling_nan_opt, + int window_rows, int window_cols, int64_t pad_up, + int64_t pad_down, int64_t pad_left, int64_t pad_right, + int row_stride, int col_stride); + + MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode, + const cnnlNanPropagation_t maxpooling_nan_opt, + const int tensor_rank, const std::vector& window, + const std::vector& padding, + const std::vector& stride); + + const cnnlPoolingDescriptor_t get() const; + + ~MLUCnnlPoolingDesc(); + + private: + cnnlPoolingDescriptor_t pooling_desc_ = nullptr; +}; + +class MLUCnnlRandomGeneratorDesc { + public: + MLUCnnlRandomGeneratorDesc(const bool is_mlu200, const int seed); + const cnnlRandGenerator_t get() const; + ~MLUCnnlRandomGeneratorDesc(); + + private: + cnnlRandGenerator_t mlu_generator = nullptr; +}; + +class MLUCnnlReduceDesc { + public: + MLUCnnlReduceDesc(const MLUCnnlReduceDesc& desc) = delete; + MLUCnnlReduceDesc& operator=(const MLUCnnlReduceDesc& desc) = delete; + + MLUCnnlReduceDesc(const std::vector& axis_vec, + const cnnlReduceOp_t reduce_op, + const cnnlDataType_t data_type, + const cnnlNanPropagation_t nan_propagation, + const cnnlReduceIndices_t reduce_indices, + const cnnlIndicesType_t indices_type); + + const cnnlReduceDescriptor_t get() const; + + ~MLUCnnlReduceDesc(); + + private: + cnnlReduceDescriptor_t reduction_desc_ = nullptr; +}; + +class MLUCnnlOpTensorDesc { + public: + MLUCnnlOpTensorDesc(const MLUCnnlOpTensorDesc& desc) = delete; + void operator=(const MLUCnnlOpTensorDesc&) = delete; + + MLUCnnlOpTensorDesc(cnnlOpTensorDesc_t op_tensor_op, + cnnlDataType_t op_tensor_comp_type, + cnnlNanPropagation_t op_tensor_nan_opt); + + const cnnlOpTensorDescriptor_t get() const; + + ~MLUCnnlOpTensorDesc(); + + private: + cnnlOpTensorDescriptor_t op_tensor_desc_ = nullptr; +}; + +class MLUCnnlNMSDesc { + public: + MLUCnnlNMSDesc(const MLUCnnlNMSDesc& desc) = delete; + MLUCnnlNMSDesc& operator=(const MLUCnnlNMSDesc& desc) = delete; + + MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode, const float iou_threshold, + const int max_output_size, const float confidence_threshold, + const int input_layout); + + const cnnlNmsDescriptor_t get() const; + + ~MLUCnnlNMSDesc(); + + private: + cnnlNmsDescriptor_t nms_desc_ = nullptr; +}; + +class MLUCnnlConvolutionDesc { + public: + MLUCnnlConvolutionDesc(const int dims, const int pad[], const int stride[], + const int dilation[], const int group_count, + const cnnlDataType_t tensor_dtype); + + MLUCnnlConvolutionDesc(const int dims, const int64_t pad[], + const int64_t stride[], const int64_t dilation[], + const int group_count, + const cnnlDataType_t tensor_dtype); + + MLUCnnlConvolutionDesc(const MLUCnnlConvolutionDesc& desc) = delete; + + MLUCnnlConvolutionDesc& operator=(const MLUCnnlConvolutionDesc& desc) = + delete; + + const cnnlConvolutionDescriptor_t get() const; + + ~MLUCnnlConvolutionDesc(); + + private: + cnnlConvolutionDescriptor_t conv_desc_ = nullptr; +}; + +class MLUCnnlBatchSpaceDesc { + public: + MLUCnnlBatchSpaceDesc(uint32_t block_shape[], uint32_t paddings[], + const uint32_t block_shape_size, + const uint32_t paddings_size); + + void getBatch2spaceNdextraInputSize(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc); + + void getSpace2batchNdextraInputSize(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc); + + void initSpace2batchNdExtraInput(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + void* extra_host_input); + + void initBatch2spaceNdExtraInput(const ExecutionContext& 
ctx, + const cnnlTensorDescriptor_t input_desc, + void* extra_host_input); + + const cnnlSpaceBatchNdDescriptor_t get() const; + + size_t getExtraInputSize() const; + + ~MLUCnnlBatchSpaceDesc(); + + private: + cnnlSpaceBatchNdDescriptor_t op_desc_ = nullptr; + size_t extra_input_size_; +}; + +class MLUCnnlTrigonDesc { + public: + explicit MLUCnnlTrigonDesc( + const cnnlTrigonFunctionMode_t trigon_function_mode); + + const cnnlTrigonDescriptor_t get() const; + + ~MLUCnnlTrigonDesc(); + + private: + cnnlTrigonDescriptor_t trigon_desc_ = nullptr; +}; + class MLUCnnl { public: - static void Active(const platform::MLUDeviceContext& ctx, + static void Active(const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); - static void ActiveGrad(const platform::MLUDeviceContext& ctx, - cnnlActivationDescriptor_t active_desc, - const void* alpha, const void* beta, - const cnnlTensorDescriptor_t y_desc, const void* y, - const cnnlTensorDescriptor_t diff_y_desc, - const void* diff_y, - const cnnlTensorDescriptor_t x_desc, const void* x, - const cnnlTensorDescriptor_t diff_x_desc, - void* diff_x); + static void ActiveGrad( + const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, + const void* alpha, const void* beta, const cnnlTensorDescriptor_t y_desc, + const void* y, const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + + static void Concat(const ExecutionContext& ctx, const int pack_num, + const int axis, const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Div(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t in0_desc, const void* in0, + const cnnlTensorDescriptor_t in1_desc, const void* in1, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Fill(const ExecutionContext& ctx, float value, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void LRN(const ExecutionContext& ctx, const int local_size, + const double alpha, const double beta, const double k, + const cnnlTensorDescriptor_t input_quant_desc, + const void* input_quant, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void QuantifyOffline(const ExecutionContext& context, + cnnlQuantizeMode_t mode, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t ouput_desc, + void* output); + + static void QuantifyOnline(const ExecutionContext& context, + const int bitwidth, + const cnnlTensorDescriptor_t input_desc, + const void* input, const bool compute_scale, + void* position, void* scale, + const cnnlTensorDescriptor_t ouput_desc, + void* output); + + static void SGD(const ExecutionContext& context, + const cnnlTensorDescriptor_t grad_desc, const void* grad, + const void* lr, const cnnlTensorDescriptor_t var_desc, + void* var); + + static void ApplyAdaGrad(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, + const cnnlTensorDescriptor_t accum_desc, void* accum, + const cnnlTensorDescriptor_t var_desc, void* 
var, + const void* lr, const bool update_slots); + + static void ApplyRMSProp(const ExecutionContext& context, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, const void* rho, + const void* momentum, const void* epsilon, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t ms_desc, void* ms, + const cnnlTensorDescriptor_t mom_desc, void* mom); + + static void ApplyCenterRMSProp( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, const void* rho, const void* momentum, + const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t mg_desc, void* mg, + const cnnlTensorDescriptor_t ms_desc, void* ms, + const cnnlTensorDescriptor_t mom_desc, void* mom); + + static void ApplyAdam(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, const void* beta1, + const void* beta2, const void* beta1_power, + const void* beta2_power, const void* epsilon, + const bool use_nesterov, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v); + + static void ApplyAdaMax(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v, + const void* diff, const void* lr, const void* beta1, + const void* beta2, const void* beta1_power, + const void* epsilon); + + static void ApplyMomentum(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, const bool use_nesterov, + const void* lr, const void* momentum, void* var, + void* accum); + + static void ApplyKerasMomentum(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, const bool use_nesterov, + const void* lr, const void* momentum, + void* var, void* accum); + + static void ApplyAdadelta(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* diff, const void* lr, const void* rho, + const void* epsilon, void* var, void* accum, + void* accum_update); + + static void SparseSoftmaxXentWithLogits( + const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, + const cnnlTensorDescriptor_t x_desc, const void* input, + const cnnlTensorDescriptor_t label_desc, const void* label, + const cnnlTensorDescriptor_t y_desc, void* output, + const cnnlTensorDescriptor_t diff_y_desc, void* back_out); + + static void RandomUniform(const ExecutionContext& ctx, const int num, + const cnnlDataType_t data_type, + const cnnlRandGenerator_t mlu_generator, + void* output); + + static void Cumsum(const ExecutionContext& ctx, const int axis, + const bool exclusive, const bool reverse, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t ouput_desc, void* output); + + static void BroadcastTo(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void GatherFunctor( + const ExecutionContext& ctx, const int axis, const int batch_dims, + const cnnlTensorDescriptor_t params_desc, const void* params, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void ScatterFunctor( + const ExecutionContext& ctx, const 
cnnlTensorDescriptor_t params_desc, + const void* params, const cnnlTensorDescriptor_t updates_desc, + const void* updates, const cnnlTensorDescriptor_t indices_desc, + const void* indices, const cnnlScatterRefMode_t mode); + + static void Range(const ExecutionContext& ctx, const void* start, + const void* end, const void* step, + const cnnlDataType_t output_dtype, void* output); + + static void Round(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void TopK(const ExecutionContext& ctx, const int k, const int dim, + const bool largest, const bool sorted, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t values_output_desc, + void* values_out, + const cnnlTensorDescriptor_t indices_output_desc, + void* indices_out); + + static void StridedSlice(const ExecutionContext& ctx, const int begin[], + const int end[], const int strides[], + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Split(const ExecutionContext& ctx, int split_num, int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]); + + static void Scale(const ExecutionContext& ctx, const int axis, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t alpha_desc, const void* alpha, + const cnnlTensorDescriptor_t beta_desc, const void* beta, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void AddN(const ExecutionContext& ctx, uint32_t input_num, + const cnnlTensorDescriptor_t inputs_desc[], + const void* inputs[], + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Log(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void StridedSliceGrad(const ExecutionContext& ctx, const int begin[], + const int end[], const int strides[], + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Logic(const ExecutionContext& ctx, + const MLULogicMethod log_method, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, const cnnlTensorDescriptor_t ouput_desc, + void* output); + + static void Select(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t then_desc, const void* p_then, + const cnnlTensorDescriptor_t else_desc, const void* p_else, + const cnnlTensorDescriptor_t output_desc, void* output, + const bool* condition, const int condition_size); + + static void AssignAdd(const ExecutionContext& ctx, const void* alpha, + const void* beta, + const cnnlTensorDescriptor_t update_desc, + const void* update, + const cnnlTensorDescriptor_t param_desc, void* param); + + static void AssignSub(const ExecutionContext& ctx, const void* alpha, + const void* beta, + const cnnlTensorDescriptor_t update_desc, + const void* update, + const cnnlTensorDescriptor_t param_desc, void* param); + + static void Assign(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t update_desc, + const void* update, + const cnnlTensorDescriptor_t param_desc, void* param); + + static void GatherNd(const ExecutionContext& ctx, + const 
cnnlTensorDescriptor_t params_desc, + const void* params, + const cnnlTensorDescriptor_t indices_desc, + const void* indices, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void BatchToSpace(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output, const cnnlSpaceBatchParam_t param); + + static void BatchToSpaceNd(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + cnnlSpaceBatchNdDescriptor_t param, + void* extra_device_input, size_t extra_input_size, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void PoolingForward( + const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const std::vector& output_shape, + cnnlPoolingDescriptor_t pooling_desc, const void* alpha, + const cnnlTensorDescriptor_t input_desc, const void* input, + const void* beta, const void* extra_input_ptr, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const std::vector& output_shape, + cnnlPoolingDescriptor_t pooling_desc, const void* alpha, + const cnnlTensorDescriptor_t input_desc, const void* input, + const void* beta, const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Pad(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const void* paddings, const void* padding_value, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Matmul(const ExecutionContext& ctx, const bool transpose_a, + const bool transpose_b, + const cnnlTensorDescriptor_t in0_desc, const void* in0, + const cnnlTensorDescriptor_t in1_desc, const void* in1, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void BatchMatmul( + const ExecutionContext& ctx, const bool transpose_a, + const bool transpose_b, const cnnlTensorDescriptor_t in0_desc, + const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void OpTensor(const ExecutionContext& ctx, + const cnnlOpTensorDescriptor_t op_tensor_desc, + const cnnlTensorDescriptor_t a_desc, const void* a, + const cnnlTensorDescriptor_t b_desc, const void* b, + const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlDataType_t dtype); + + static void BiasAddGrad(const ExecutionContext& ctx, const int axis, + const cnnlTensorDescriptor_t out_backprop_desc, + const void* out_backprop, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void OneHot(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t desc_indices, + const void* indices, const int depth, const void* on_value, + const void* off_value, const int axis, + cnnlDataType_t output_data_type, void* output); + + static void NonMaxSuppression(const ExecutionContext& ctx, + const cnnlNmsDescriptor_t nms_desc, + const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, + const cnnlTensorDescriptor_t confidence_desc, + const void* confidence, + const cnnlTensorDescriptor_t output_desc, + void* output, void* output_size); + + static void SoftmaxCrossEntropyWithLogits( + const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* logits_in, + const cnnlTensorDescriptor_t label_desc, const void* labels_in, + const cnnlTensorDescriptor_t loss_out_desc, void* loss_out, + const 
cnnlTensorDescriptor_t back_out_desc, void* back_out); + + static void SoftmaxForward(const ExecutionContext& ctx, + cnnlSoftmaxAlgorithm_t algorithm, + cnnlSoftmaxMode_t mode, const void* alpha, + const cnnlTensorDescriptor_t input_desc, + const void* input, const void* beta, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Softplus(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t features_desc, + const void* features, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void SoftplusGrad(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t gradients_desc, + const void* gradients, + const cnnlTensorDescriptor_t features_desc, + const void* features, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void RsqrtGrad(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t data_desc, const void* y, + const void* diff_y, void* output); + + static void SqrtGrad(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t data_desc, const void* y, + const void* diff_y, void* output); + + static void ConvolutionForward( + const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc_, + const void* alpha, const void* beta, + const cnnlTensorDescriptor_t bias_desc, const void* bias_ptr, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t filtet_desc, const void* filter, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void FusedConvBNQuantify( + const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc, + const void* epsilon_ptr, const int fused_ops_number, + const cnnlDataType_t tensor_dtype, const int input_position, + const float input_scale, const int filter_position, + const float filter_scale, const cnnlTensorDescriptor_t scale_desc, + const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc, + const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc, + const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc, + const void* variance_ptr, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t filtet_desc, + const void* filter, const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Tile(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void UnsortedSegmentSum(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t data_desc, + const void* data, + const cnnlTensorDescriptor_t ids_desc, + const int* segment_ids, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Reduce(const ExecutionContext& ctx, const bool need_workspace, + const cnnlReduceDescriptor_t reduction_desc, + const void* alpha, const cnnlTensorDescriptor_t input_desc, + const void* input, const size_t indices_size, + void* indices, const void* beta, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void FloorDiv(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void FloorMod(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Maximum(const 
ExecutionContext& ctx, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Minimum(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void PowR(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, const void* input1, + const cnnlTensorDescriptor_t input2_desc, const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void DivNoNan(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void SquaredDifference(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void L2Loss(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + void* output); + + static void Abs(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Neg(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Floor(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Ceil(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void IsNan(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Square(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Sqrt(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Rsqrt(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Cos(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Sin(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void TrigonForward(const ExecutionContext& ctx, + const cnnlTrigonDescriptor_t trigon_desc, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Exp(const ExecutionContext& ctx, + 
cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Sign(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void IsFinite(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void IsNanInf(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, void* output); + + static void Erf(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Log1p(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void LogicalNot(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void DynamicStitch( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t* indices_desc, + const int** indices, const cnnlTensorDescriptor_t* data_desc, + const void** data, const int size, int* indices_dims, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void CropAndResize( + const ExecutionContext& ctx, const std::string method_name, + const float extrapolation_value, const cnnlTensorDescriptor_t image_desc, + const void* image, const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, const cnnlTensorDescriptor_t box_index_desc, + const void* box_index, const cnnlTensorDescriptor_t output_desc, + void* output); + + static void CropAndResizeBackwardImage( + const ExecutionContext& ctx, const std::string method_name, + const cnnlTensorDescriptor_t image_desc, const void* image, + const cnnlTensorDescriptor_t boxes_desc, const void* boxes, + const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx, + const cnnlTensorDescriptor_t grads_image_desc, void* grads_image); + + static void CropAndResizeBackwardBoxes( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t image_desc, + const void* image, const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, const cnnlTensorDescriptor_t box_idx_desc, + const void* box_idx, const cnnlTensorDescriptor_t output_desc, + void* output); + + static void PoolingBackward( + const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, + const void* alpha, const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, + const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + + static void PoolingIndex(const ExecutionContext& ctx, + const cnnlPoolingDescriptor_t pooling_desc, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t y_desc, void* y); + + static void SpaceToBatch(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output, const int64_t block_shape[]); + + static void SpaceToBatchNd(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + 
const void* input, + cnnlSpaceBatchNdDescriptor_t param, + void* extra_device_input, size_t extra_input_size, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Interp(const ExecutionContext& ctx, const cnnlInterpMode_t mode, + const bool align_corners, const bool half_pixel_centers, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void InterpBackward( + const ExecutionContext& ctx, const cnnlInterpBackwardMode_t mode, + const bool align_corners, const bool half_pixel_centers, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void QuantizeParam(const ExecutionContext& ctx, + const cnnlQuantizeMode_t mode, const int bitwidth, + const cnnlTensorDescriptor_t input_desc, + const void* input, void* position, void* scale, + void* offset); + + static void QuantizeMatMul( + const ExecutionContext& ctx, const bool transpose_a, + const bool transpose_b, const cnnlTensorDescriptor_t a_desc, + const void* a, const void* a_position, const void* a_scale, + const void* a_offset, const cnnlTensorDescriptor_t b_desc, const void* b, + const void* b_position, const void* b_scale, const void* b_offset, + const cnnlDataType_t quant_type, const cnnlDataType_t data_type, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void QuantizeBatchMatMul( + const ExecutionContext& ctx, const bool adj_x, const bool adj_y, + const cnnlTensorDescriptor_t a_desc, const void* a, + const void* a_position, const void* a_scale, const void* a_offset, + const cnnlTensorDescriptor_t b_desc, const void* b, + const void* b_position, const void* b_scale, const void* b_offset, + const cnnlDataType_t quant_type, const cnnlDataType_t data_type, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void QuantizeBatchMatMulBCast( + const ExecutionContext& ctx, const bool adj_x, const bool adj_y, + const cnnlTensorDescriptor_t a_desc, const void* a, + const void* a_position, const void* a_scale, const void* a_offset, + const cnnlTensorDescriptor_t b_desc, const void* b, + const void* b_position, const void* b_scale, const void* b_offset, + const cnnlDataType_t quant_type, const cnnlDataType_t data_type, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void FusedBatchNorm( + const ExecutionContext& ctx, const bool is_training, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t scale_desc, const void* scale, + const void* offset, const void* estimated_mean, + const void* estimated_variance, float epsilon, float momentum, + const cnnlTensorDescriptor_t output_desc, void* output, void* batch_mean, + void* batch_var, void* saved_mean, void* saved_var); + + static void FusedBatchNormGrad( + const ExecutionContext& ctx, const bool is_training, + const cnnlTensorDescriptor_t y_backprop_desc, const void* y_backprop, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t scale_desc, const void* scale, + const void* saved_mean, const void* saved_var, float epsilon, + const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop, + void* scale_backprop, void* offset_backprop); + + static void Transpose(const ExecutionContext& ctx, + const std::vector perm, const int input_dim, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void MatrixBandPart(const 
ExecutionContext& ctx, + const cnnlTensorDescriptor_t data_desc, + const void* input, const int num_lower, + const int num_upper, void* output); + + static void NumTrue(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, const void* x, + Tensor index, uint32_t* num_true); + + static void Where(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, const void* x, + const uint32_t* strides, const uint32_t* index, + const cnnlTensorDescriptor_t y_desc, int* y, + const bool as_tuple); + + static void Conv2D(const ExecutionContext& ctx, + const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, + const cnnlDataType_t dt_onchip, const void* input_position, + const void* input_scale, const void* input_offset, + const void* filter_position, const void* filter_scale, + const void* filter_offset, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t filter_desc, + const void* filter, const cnnlTensorDescriptor_t bias_desc, + const void* bias, const cnnlTensorDescriptor_t output_desc, + void* output); + + static void ConvBackpropInput( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlTensorDescriptor_t input_desc, const void* filter, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop); + + static void QuantizeConvBackpropInput( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, + const void* filter_position, const void* filter_scale, + const void* filter_offset, const void* out_backprop_position, + const void* out_backprop_scale, const void* out_backprop_offset, + const cnnlTensorDescriptor_t input_desc, const void* filter, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop); + + static void ConvBackpropFilter( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop); + + static void QuantizeConvBackpropFilter( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, + const void* input_position, const void* input_scale, + const void* input_offset, const void* out_backprop_position, + const void* out_backprop_scale, const void* out_backprop_offset, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop); + + static void InTopK(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t predictions_desc, + const void* predictions, + const cnnlTensorDescriptor_t targets_desc, + const void* targets, const cnnlTensorDescriptor_t k_desc, + const void* k, const int k_int, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void ScatterNd(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t indices_desc, + const void* indices, + const cnnlTensorDescriptor_t updates_desc, + const void* updates, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void BitWise(const ExecutionContext& ctx, + const 
cnnlBitComputeOp_t optype, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void QR(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t a_desc, const void* a, + const cnnlTensorDescriptor_t q_desc, void* q, + const cnnlTensorDescriptor_t r_desc, void* r, const bool some); + + static void Reciprocal(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); }; } // namespace operators diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc new file mode 100644 index 0000000000000..90e513cb1cd07 --- /dev/null +++ b/paddle/fluid/operators/mode_op.cc @@ -0,0 +1,155 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mode_op.h" +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { + +class ModeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode"); + OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode"); + + auto input_dims = ctx->GetInputDim("X"); + const int& dim_size = input_dims.size(); + int axis = static_cast(ctx->Attrs().Get("axis")); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), true, + paddle::platform::errors::InvalidArgument( + "the axis of ModeOp must be [-%d, %d), but you set axis is %d", + dim_size, dim_size, axis)); + PADDLE_ENFORCE_GE(input_dims.size(), 1, + paddle::platform::errors::InvalidArgument( + "input of ModeOp must have >= 1d shape")); + if (axis < 0) axis += dim_size; + bool keepdim = ctx->Attrs().Get("keepdim"); + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + framework::DDim dims = framework::make_ddim(dimvec); + PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( + "input shape should >= 1d")); + ctx->SetOutputDim("Out", dims); + ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(), + 
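// Added example (not from the original source): for an input X of shape
// {2, 3, 4} and axis = 1, the InferShape above yields Out and Indices of
// shape {2, 4}, or {2, 1, 4} when the keepdim attribute is true.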
layout_, library_); + } +}; + +class ModeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of Mode op"); + AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); + AddAttr("axis", + "the axis to calculate mode values." + "if not set, will calculate on last axis.") + .SetDefault(-1); + AddAttr("keepdim", "Keep the dim that to reduce.").SetDefault(false); + AddComment(R"DOC( +This operator finds the mode of input Tensor. And outputs their values and indices as vectors. +)DOC"); + } +}; + +class ModeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::InvalidArgument("Input(X) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Indices"), true, + platform::errors::InvalidArgument("Input(Indices) should be not null")); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + platform::errors::InvalidArgument( + "Grad Input(Out) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("X")), true, + platform::errors::InvalidArgument("Grad Output(X) should be not null")); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class ModeGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("mode_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("X", this->Input("X")); + op->SetInput("Indices", this->Output("Indices")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker, + ops::ModeGradOpMaker, + ops::ModeGradOpMaker); +REGISTER_OP_CPU_KERNEL(mode, + ops::ModeCPUKernel, + ops::ModeCPUKernel, + ops::ModeCPUKernel, + ops::ModeCPUKernel); + +REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad); +REGISTER_OP_CPU_KERNEL( + mode_grad, ops::ModeGradCPUKernel, + ops::ModeGradCPUKernel, + ops::ModeGradCPUKernel, + ops::ModeGradCPUKernel); diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu new file mode 100644 index 0000000000000..b42bdb548216e --- /dev/null +++ b/paddle/fluid/operators/mode_op.cu @@ -0,0 +1,233 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mode_op.h" +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/fluid/operators/top_k_v2_op.h" + +namespace paddle { +namespace operators { + +int ComputeBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +void getModebySort(const platform::CUDADeviceContext& ctx, + const framework::Tensor* input_tensor, + const int64_t num_cols, const int64_t num_rows, + T* out_tensor, int64_t* indices_tensor) { + framework::Tensor input_tmp; + framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp); + T* input_tmp_data = input_tmp.mutable_data(ctx.GetPlace()); + input_tmp.Resize(framework::make_ddim({num_rows, num_cols})); + thrust::device_ptr out_tensor_ptr(out_tensor); + thrust::device_ptr indices_tensor_ptr(indices_tensor); + + for (int64_t i = 0; i < num_rows; ++i) { + T* begin = input_tmp_data + num_cols * i; + T* end = input_tmp_data + num_cols * (i + 1); + thrust::device_vector indices_data(num_cols); + thrust::sequence(thrust::device, indices_data.begin(), + indices_data.begin() + num_cols); + thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); + int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1, + begin + 1, 0, thrust::plus(), + thrust::not_equal_to()); + thrust::device_vector keys_data(unique); + thrust::device_vector cnts_data(unique); + thrust::reduce_by_key(thrust::device, begin, end, + thrust::constant_iterator(1), keys_data.begin(), + cnts_data.begin()); + auto it = thrust::max_element(thrust::device, cnts_data.begin(), + cnts_data.begin() + unique); + T mode = keys_data[it - cnts_data.begin()]; + int64_t counts = cnts_data[it - cnts_data.begin()]; + auto pos = thrust::find(thrust::device, begin, end, mode); + int64_t index = indices_data[pos - begin + counts - 1]; + out_tensor_ptr[i] = static_cast(mode); + indices_tensor_ptr[i] = static_cast(index); + } +} + +template +class ModeOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument( + "It must use CUDAPlace, you must check your device set.")); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + int axis = static_cast(ctx.Attr("axis")); + bool keepdim = static_cast(ctx.Attr("keepdim")); + + // get the input dims + const auto& in_dims = input->dims(); + // calcluate the real axis + if (axis < 0) axis += in_dims.size(); + + auto out_dims = output->dims(); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + const auto& dev_ctx = ctx.cuda_device_context(); + getModebySort(dev_ctx, input, input_width, input_height, output_data, + indices_data); + 
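// Added trace (not part of the original kernel): how getModebySort above
// resolves a single row, assuming an ascending thrust::sort_by_key.
//   row values {3, 1, 3, 2, 3}, carried original indices {0, 1, 2, 3, 4}
//   sort_by_key      -> keys {1, 2, 3, 3, 3}, indices permuted alongside
//   inner_product    -> 2 adjacent key changes, so unique = 3
//   reduce_by_key    -> keys_data {1, 2, 3}, cnts_data {1, 1, 3}
//   max_element      -> largest count is 3, hence mode = 3, counts = 3
//   find + indexing  -> indices_data[pos - begin + counts - 1] reports the
//                       original index carried by the last slot of the
//                       mode's sorted run (which duplicate that is depends
//                       on how the unstable sort orders equal keys).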
} else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + framework::DDim tmp_out_dim = framework::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + framework::DDim trans_shape(in_dims); + framework::DDim trans_out_shape(in_dims); + for (int i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + // second step, tranpose the input + framework::Tensor trans_input; + trans_input.mutable_data(trans_shape, ctx.GetPlace()); + int ndims = trans_axis.size(); + const auto& dev_ctx = ctx.cuda_device_context(); + TransCompute(ndims, dev_ctx, *input, + &trans_input, trans_axis); + framework::Tensor trans_ind; + int64_t* trans_ind_data = + trans_ind.mutable_data(trans_out_shape, ctx.GetPlace()); + framework::Tensor trans_out; + T* trans_out_data = + trans_out.mutable_data(trans_out_shape, ctx.GetPlace()); + + const int64_t input_height = framework::product( + framework::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + getModebySort(dev_ctx, &trans_input, input_width, input_height, + trans_out_data, trans_ind_data); + // last step, tranpose back the indices and output + TransCompute( + ndims, dev_ctx, trans_ind, indices, trans_axis); + TransCompute(ndims, dev_ctx, trans_out, + output, trans_axis); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } + } +}; + +template +class ModeOpGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::InvalidArgument( + "It must use CUDAPlace, you must check your device set.")); + auto* x = context.Input("X"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = + context.Output(framework::GradVarName("X")); + int axis = context.Attr("axis"); + + const auto& in_dims = x->dims(); + auto out_dims = indices->dims(); + + if (axis < 0) axis += in_dims.size(); + // allocate the cuda memory for the x_grad + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + const T* out_grad_data = out_grad->data(); + const int64_t* indices_data = indices->data(); + + int pre, n, post; + GetDims(in_dims, axis, &pre, &n, &post); + + // calcluate the block and grid num + auto& dev_ctx = context.cuda_device_context(); + int block_size = ComputeBlockSize(post); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + AssignGradWithAxis<<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + mode, ops::ModeOpCUDAKernel, + ops::ModeOpCUDAKernel, + 
ops::ModeOpCUDAKernel, + ops::ModeOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL( + mode_grad, + ops::ModeOpGradCUDAKernel, + ops::ModeOpGradCUDAKernel, + ops::ModeOpGradCUDAKernel, + ops::ModeOpGradCUDAKernel); diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h new file mode 100644 index 0000000000000..dac0ff9279c09 --- /dev/null +++ b/paddle/fluid/operators/mode_op.h @@ -0,0 +1,317 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/transpose_op.h" + +namespace paddle { +namespace operators { + +template +static void getMode(Type input_height, Type input_width, int input_dim, + const framework::Tensor* input, T* t_out, Type* t_indices) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = framework::EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + std::sort(col_vec.begin(), col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + T mode = 0; + int64_t indice = 0; + int64_t cur_freq = 0; + int64_t max_freq = 0; + for (int64_t i = 0; i < input_width; ++i) { + ++cur_freq; + if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { + if (cur_freq > max_freq) { + max_freq = cur_freq; + mode = col_vec[i].first; + indice = col_vec[i].second; + } + cur_freq = 0; + } + } + t_out[i] = mode; + t_indices[i] = indice; + } +} + +template +static void ModeAssign(const Type& input_height, const Type& input_width, + const int& input_dim, const framework::Tensor* input, + const framework::Tensor* indices, T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = framework::EigenVector::Flatten(*input); + auto e_indices = framework::EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = + framework::EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +template +class ModeCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto* indices = 
context.Output("Indices"); + const auto& in_dims = input->dims(); + bool keepdim = static_cast(context.Attr("keepdim")); + + // axis < 0, cacluate the real axis + int axis = static_cast(context.Attr("axis")); + if (axis < 0) axis += in_dims.size(); + + T* output_data = output->mutable_data(context.GetPlace()); + int64_t* indices_data = indices->mutable_data(context.GetPlace()); + auto out_dims = output->dims(); + // if axis is not the last dim, transpose it to the last dim, do the + // calculation, + // then tranpose it back to orginal axis. + if (axis == in_dims.size() - 1) { + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + getMode(input_height, input_width, in_dims.size(), input, + output_data, indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + framework::DDim tmp_out_dim = framework::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + // get the trans input_dims, out_dims + framework::DDim trans_shape(in_dims); + framework::DDim trans_out_shape(in_dims); + + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + framework::Tensor trans_input; + trans_input.mutable_data(trans_shape, context.GetPlace()); + int ndims = trans_axis.size(); + auto& dev_context = + context.template device_context(); + + // transpose the input value + TransCompute(ndims, dev_context, *input, + &trans_input, trans_axis); + + const int64_t input_height = framework::product( + framework::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + framework::Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_out_shape, context.GetPlace()); + framework::Tensor tmp_indices; + auto* t_ind = tmp_indices.mutable_data(trans_out_shape, + context.GetPlace()); + + getMode(input_height, input_width, in_dims.size(), + &trans_input, t_out, t_ind); + // transpose back + TransCompute( + ndims, dev_context, tmp_indices, indices, trans_axis); + TransCompute(ndims, dev_context, tmp_out, + output, trans_axis); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } + } +}; + +template +class ModeGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = + context.Output(framework::GradVarName("X")); + int axis = static_cast(context.Attr("axis")); + bool keepdim = static_cast(context.Attr("keepdim")); + + auto in_dims = x->dims(); + auto out_dims = indices->dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? 
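// Added note (not from the original): for in_dims = {2, 3, 4} and axis = -2,
// the normalized axis is 1. Both the forward and the gradient kernels then
// build the permutation trans_axis = {0, 2, 1}, which moves the mode axis to
// the back; e.g. the forward pass computes on a transposed view of shape
// {2, 4, 3}, produces per-row results of shape {2, 4, 1}, and transposes
// them back to the original layout.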
(in_dims.size() + axis) : axis; + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = framework::make_ddim(tmp_out_shape); + } + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + if (axis == in_dims.size() - 1) { + // allocate the memory for the input_grad + // assign the out_grad to input_grad directly + const int64_t input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements has no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + if (keepdim) { + ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices, + x_grad_data); + } else { + auto& dev_context = + context.template device_context(); + framework::Tensor out_grad_tmp; + framework::Tensor indices_tmp; + out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); + indices_tmp.mutable_data(indices->dims(), + dev_context.GetPlace()); + framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, + &out_grad_tmp); + framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, + &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, + &indices_tmp, x_grad_data); + } + } else { + // can not assign grad to input_grad, must do the transpose + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + framework::DDim trans_shape(out_dims); + framework::DDim trans_in_shape(in_dims); + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = out_dims[trans_axis[i]]; + trans_in_shape[i] = in_dims[trans_axis[i]]; + } + // transpose the out_grad, indices + framework::Tensor trans_dO; + trans_dO.mutable_data(trans_shape, context.GetPlace()); + framework::Tensor trans_ind; + trans_ind.mutable_data(trans_shape, context.GetPlace()); + int ndims = trans_axis.size(); + auto& dev_context = + context.template device_context(); + + if (keepdim) { + // Do transpose + TransCompute( + ndims, dev_context, *out_grad, &trans_dO, trans_axis); + TransCompute( + ndims, dev_context, *indices, &trans_ind, trans_axis); + } else { + framework::Tensor out_grad_tmp; + framework::Tensor indices_tmp; + out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); + indices_tmp.mutable_data(indices->dims(), + dev_context.GetPlace()); + framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, + &out_grad_tmp); + framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, + &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + // Do transpose + TransCompute( + ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis); + TransCompute( + ndims, dev_context, indices_tmp, &trans_ind, trans_axis); + } + const int64_t input_height = framework::product( + framework::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); + const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; + + // Assign the out_grad to tranpose 
input_grad + framework::Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_in_shape, context.GetPlace()); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + ModeAssign(input_height, input_width, in_dims.size(), + &trans_dO, &trans_ind, t_out); + + // Transpose back + TransCompute(ndims, dev_context, tmp_out, + x_grad, trans_axis); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 57231e1135a6a..3b9cf159f1b6b 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -29,20 +29,18 @@ __global__ void AdamKernelREG(MT beta1, MT beta2, MT epsilon, MT beta1_pow_, MT beta1_pow = beta1_pow_; MT beta2_pow = beta2_pow_; - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - int id = blockIdx.x * blockDim.x + threadIdx.x; for (; id < ndim; id += gridDim.x * blockDim.x) { MT p = master_param ? master_param[id] : static_cast(param[id]); MT g = static_cast(grad[id]); - MT mom1 = moment1[id]; - MT mom2 = moment2[id]; + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr * (mom1 / - (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); moment1_out[id] = mom1; moment2_out[id] = mom2; @@ -65,9 +63,6 @@ __global__ void AdamKernelMEM(MT beta1, MT beta2, MT epsilon, MT beta1_pow = *beta1_pow_; MT beta2_pow = *beta2_pow_; - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - int id = blockIdx.x * blockDim.x + threadIdx.x; for (; id < ndim; id += gridDim.x * blockDim.x) { @@ -77,8 +72,9 @@ __global__ void AdamKernelMEM(MT beta1, MT beta2, MT epsilon, MT mom2 = static_cast(moment2[id]); mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr * (mom1 / - (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); moment1_out[id] = mom1; moment2_out[id] = mom2; @@ -105,8 +101,6 @@ __global__ void SparseAdamCUDAKernelREG( int64_t row_numel, int64_t row_count, bool lazy_mode, int ndim) { int id = blockIdx.x * blockDim.x + threadIdx.x; MT lr = *lr_; - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); for (; id < ndim; id += blockDim.x * gridDim.x) { auto row_idx = @@ -122,8 +116,10 @@ __global__ void SparseAdamCUDAKernelREG( : static_cast(0); mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + - epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + MT denom = + (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); // Write back to global memory mom1_out_[id] = mom1; diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu index 49b7fe771be13..8b152bc67a30b 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -27,25 +27,25 @@ __global__ void AdamWKernelREG(MT beta1, MT beta2, MT epsilon, MT 
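// Added derivation (not part of the patch): the rewritten Adam/AdamW step is
// the canonical update with epsilon added to the bias-corrected second
// moment. With m_hat = mom1 / (1 - beta1_pow) and v_hat = mom2 / (1 - beta2_pow),
//   p -= lr * m_hat / (sqrt(v_hat) + epsilon)
//     = (mom1 / (sqrt(mom2) / sqrt(1 - beta2_pow) + epsilon))
//       * (lr / (1 - beta1_pow)),
// which is exactly the denom = sqrt(mom2) / sqrt(1 - beta2_pow) + epsilon and
// p += (mom1 / denom) * (-(lr / (1 - beta1_pow))) form used above. The old
// code, which folded sqrt(1 - beta2_pow) / (1 - beta1_pow) into lr and kept
// epsilon * sqrt(1 - beta2_pow) inside the denominator, expands to the same
// expression, so only the evaluation order (and rounding) changes. Likewise,
// p *= (1 - lr * coeff) in the AdamW kernels is the decoupled weight decay
// p -= lr * coeff * p written as a single scale.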
coeff, T* param_out, const MT* master_param, MT* master_param_out, int ndim) { MT lr = *lr_ * lr_ratio; - MT lr_orig = lr; MT beta1_pow = beta1_pow_; MT beta2_pow = beta2_pow_; - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - int id = blockIdx.x * blockDim.x + threadIdx.x; for (; id < ndim; id += gridDim.x * blockDim.x) { MT p = master_param ? master_param[id] : static_cast(param[id]); MT g = static_cast(grad[id]); - MT mom1 = moment1[id]; - MT mom2 = moment2[id]; + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + + p *= (static_cast(1.0) - lr * coeff); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr_orig * coeff * p; - p -= lr * (mom1 / - (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); moment1_out[id] = mom1; moment2_out[id] = mom2; @@ -63,13 +63,9 @@ __global__ void AdamWKernelMEM( MT* moment2_out, const MT* lr_, const T* grad, const T* param, T* param_out, const MT* master_param, MT* master_param_out, int ndim) { MT lr = *lr_ * lr_ratio; - MT lr_orig = lr; MT beta1_pow = *beta1_pow_; MT beta2_pow = *beta2_pow_; - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - int id = blockIdx.x * blockDim.x + threadIdx.x; for (; id < ndim; id += gridDim.x * blockDim.x) { @@ -77,11 +73,15 @@ __global__ void AdamWKernelMEM( MT g = static_cast(grad[id]); MT mom1 = static_cast(moment1[id]); MT mom2 = static_cast(moment2[id]); + + p *= (static_cast(1.0) - lr * coeff); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr_orig * coeff * p; - p -= lr * (mom1 / - (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); moment1_out[id] = mom1; moment2_out[id] = mom2; @@ -109,10 +109,6 @@ __global__ void SparseAdamWCUDAKernelREG( int ndim) { int id = blockIdx.x * blockDim.x + threadIdx.x; MT lr = *lr_ * lr_ratio; - MT lr_orig = lr; - - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); for (; id < ndim; id += blockDim.x * gridDim.x) { auto row_idx = @@ -120,17 +116,23 @@ __global__ void SparseAdamWCUDAKernelREG( if (lazy_mode && row_idx < 0) { return; } else { - MT mom1 = mom1_[id]; - MT mom2 = mom2_[id]; + MT mom1 = static_cast(mom1_[id]); + MT mom2 = static_cast(mom2_[id]); + MT p = master_param ? master_param[id] : static_cast(param_[id]); MT g = row_idx >= 0 ? 
static_cast(grad_[row_idx * row_numel + id % row_numel]) : static_cast(0); + + p *= (static_cast(1.0) - lr * coeff); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - p -= lr_orig * coeff * p; - p -= lr * (mom1 / (sqrt(mom2) + - epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + MT denom = + (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); // Write back to global memory mom1_out_[id] = mom1; diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index 9eba8df9992fc..e3798b49dcbb1 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -52,19 +52,16 @@ struct LambMomentREGUpdateFunctor { const bool* skip_update_; LambMomentREGUpdateFunctor(MT weight_decay, MT beta1, MT beta2, MT epsilon, - MT beta1_pow, MT* beta1_pow_out, MT beta2_pow, - MT* beta2_pow_out, const MT* mom1, MT* mom1_out, - const MT* mom2, MT* mom2_out, const T* grad, - const MT* param, MT* trust_ratio_div, - const bool* skip_update) + MT beta1_pow, MT beta2_pow, const MT* mom1, + MT* mom1_out, const MT* mom2, MT* mom2_out, + const T* grad, const MT* param, + MT* trust_ratio_div, const bool* skip_update) : weight_decay_(weight_decay), beta1_(beta1), beta2_(beta2), epsilon_(epsilon), beta1_pow_(beta1_pow), - beta1_pow_out_(beta1_pow_out), beta2_pow_(beta2_pow), - beta2_pow_out_(beta2_pow_out), moment1_(mom1), moment1_out_(mom1_out), moment2_(mom2), @@ -95,10 +92,6 @@ struct LambMomentREGUpdateFunctor { trust_ratio_div_[i] = mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + weight_decay_ * p; - if (beta1_pow_out_ && beta2_pow_out_) { - beta1_pow_out_[0] = beta1_pow * beta1_; - beta2_pow_out_[0] = beta2_pow * beta2_; - } } }; @@ -113,9 +106,7 @@ struct LambMomentMENUpdateFunctor { MT epsilon_; const MT* beta1_pow_; - MT* beta1_pow_out_; const MT* beta2_pow_; - MT* beta2_pow_out_; const MT* moment1_; MT* moment1_out_; const MT* moment2_; @@ -126,8 +117,7 @@ struct LambMomentMENUpdateFunctor { const bool* skip_update_; LambMomentMENUpdateFunctor(MT weight_decay, MT beta1, MT beta2, MT epsilon, - const MT* beta1_pow, MT* beta1_pow_out, - const MT* beta2_pow, MT* beta2_pow_out, + const MT* beta1_pow, const MT* beta2_pow, const MT* mom1, MT* mom1_out, const MT* mom2, MT* mom2_out, const T* grad, const MT* param, MT* trust_ratio_div, const bool* skip_update) @@ -136,9 +126,7 @@ struct LambMomentMENUpdateFunctor { beta2_(beta2), epsilon_(epsilon), beta1_pow_(beta1_pow), - beta1_pow_out_(beta1_pow_out), beta2_pow_(beta2_pow), - beta2_pow_out_(beta2_pow_out), moment1_(mom1), moment1_out_(mom1_out), moment2_(mom2), @@ -168,10 +156,6 @@ struct LambMomentMENUpdateFunctor { trust_ratio_div_[i] = mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + weight_decay_ * p; - if (beta1_pow_out_ && beta2_pow_out_) { - beta1_pow_out_[0] = beta1_pow * beta1_; - beta2_pow_out_[0] = beta2_pow * beta2_; - } } }; @@ -183,9 +167,7 @@ struct SparseLambMomentREGUpdateFunctor { T epsilon_; T beta1_pow_; - T* beta1_pow_out_; T beta2_pow_; - T* beta2_pow_out_; const T* moment1_; T* moment1_out_; const T* moment2_; @@ -201,20 +183,18 @@ struct SparseLambMomentREGUpdateFunctor { const bool* skip_update_; SparseLambMomentREGUpdateFunctor(T weight_decay, T beta1, T beta2, T epsilon, - T beta1_pow, T* beta1_pow_out, T beta2_pow, - T* beta2_pow_out, const T* mom1, T* mom1_out, - const T* mom2, T* 
mom2_out, const T* grad, - const T* param, T* trust_ratio_div, - const int64_t* rows, int64_t row_numel, - int64_t row_count, const bool* skip_update) + T beta1_pow, T beta2_pow, const T* mom1, + T* mom1_out, const T* mom2, T* mom2_out, + const T* grad, const T* param, + T* trust_ratio_div, const int64_t* rows, + int64_t row_numel, int64_t row_count, + const bool* skip_update) : weight_decay_(weight_decay), beta1_(beta1), beta2_(beta2), epsilon_(epsilon), beta1_pow_(beta1_pow), - beta1_pow_out_(beta1_pow_out), beta2_pow_(beta2_pow), - beta2_pow_out_(beta2_pow_out), moment1_(mom1), moment1_out_(mom1_out), moment2_(mom2), @@ -246,10 +226,6 @@ struct SparseLambMomentREGUpdateFunctor { trust_ratio_div_[i] = mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + weight_decay_ * p; - if (beta1_pow_out_ && beta1_pow_out_) { - beta1_pow_out_[0] = beta1_pow * beta1_; - beta2_pow_out_[0] = beta2_pow * beta2_; - } } inline HOSTDEVICE void operator()(size_t i) const { @@ -270,9 +246,7 @@ struct SparseLambMomentMENUpdateFunctor { T epsilon_; const T* beta1_pow_; - T* beta1_pow_out_; const T* beta2_pow_; - T* beta2_pow_out_; const T* moment1_; T* moment1_out_; const T* moment2_; @@ -288,8 +262,7 @@ struct SparseLambMomentMENUpdateFunctor { const bool* skip_update_; SparseLambMomentMENUpdateFunctor(T weight_decay, T beta1, T beta2, T epsilon, - const T* beta1_pow, T* beta1_pow_out, - const T* beta2_pow, T* beta2_pow_out, + const T* beta1_pow, const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, T* mom2_out, const T* grad, const T* param, T* trust_ratio_div, const int64_t* rows, @@ -300,9 +273,7 @@ struct SparseLambMomentMENUpdateFunctor { beta2_(beta2), epsilon_(epsilon), beta1_pow_(beta1_pow), - beta1_pow_out_(beta1_pow_out), beta2_pow_(beta2_pow), - beta2_pow_out_(beta2_pow_out), moment1_(mom1), moment1_out_(mom1_out), moment2_(mom2), @@ -334,10 +305,6 @@ struct SparseLambMomentMENUpdateFunctor { trust_ratio_div_[i] = mom1_unbiased / (Eigen::numext::sqrt(mom2_unbiased) + epsilon_) + weight_decay_ * p; - if (beta1_pow_out_ && beta1_pow_out_) { - beta1_pow_out_[0] = beta1_pow * beta1_; - beta2_pow_out_[0] = beta2_pow * beta2_; - } } inline HOSTDEVICE void operator()(size_t i) const { @@ -350,11 +317,44 @@ struct SparseLambMomentMENUpdateFunctor { } }; -template -struct LambParamUpateFunctor { - using MT = typename std::conditional< - IsMultiPrecision, typename details::MPTypeTrait::Type, T>::type; +template +struct LambBetaPowUpdateFunctor { + void SetBetaPows(const MT* beta1pow, const MT* beta2pow, MT* beta1pow_out, + MT* beta2pow_out, MT beta1, MT beta2) { + beta1pow_ = beta1pow; + beta2pow_ = beta2pow; + beta1pow_out_ = beta1pow_out; + beta2pow_out_ = beta2pow_out; + beta1_ = beta1; + beta2_ = beta2; + } + HOSTDEVICE void UpdateBetaPow(size_t i) const { + if (i == 0) { + beta1pow_out_[0] = beta1pow_[0] * beta1_; + beta2pow_out_[0] = beta2pow_[0] * beta2_; + } + } + + private: + const MT* beta1pow_; + const MT* beta2pow_; + MT* beta1pow_out_; + MT* beta2pow_out_; + MT beta1_; + MT beta2_; +}; + +template +struct LambBetaPowUpdateFunctor { + void SetBetaPows(const MT* beta1pow, const MT* beta2pow, MT* beta1pow_out, + MT* beta2pow_out, MT beta1, MT beta2) {} + HOSTDEVICE void UpdateBetaPow(size_t) const {} +}; + +template +struct LambParamUpateFunctor + : public LambBetaPowUpdateFunctor { const MT* lr_; const T* param_; const MT* master_param_; @@ -396,6 +396,7 @@ struct LambParamUpateFunctor { if (IsMultiPrecision) { master_param_out_[i] = param_out; } + this->UpdateBetaPow(i); } }; 
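
The hunk above removes the per-element `beta1_pow_out_`/`beta2_pow_out_` writes (including the copy-paste check that tested `beta1_pow_out_` twice) from the LAMB moment functors and instead folds the beta-pow update into the parameter-update functor through a small base class specialized on a compile-time bool, so the "no update" case compiles to an empty call. The following is a minimal standalone sketch of that specialization pattern; the type and member names are illustrative, not the Paddle classes:

```cpp
#include <cstddef>
#include <cstdio>

// Primary template: performs the beta-pow update, but only from index 0 so
// the scalar accumulators are written exactly once per pass.
template <typename T, bool NeedUpdate>
struct BetaPowUpdater {
  void Set(const T* b1p, const T* b2p, T* b1p_out, T* b2p_out, T b1, T b2) {
    beta1_pow_ = b1p;
    beta2_pow_ = b2p;
    beta1_pow_out_ = b1p_out;
    beta2_pow_out_ = b2p_out;
    beta1_ = b1;
    beta2_ = b2;
  }
  void UpdateBetaPow(std::size_t i) const {
    if (i == 0) {
      beta1_pow_out_[0] = beta1_pow_[0] * beta1_;
      beta2_pow_out_[0] = beta2_pow_[0] * beta2_;
    }
  }

 private:
  const T* beta1_pow_ = nullptr;
  const T* beta2_pow_ = nullptr;
  T* beta1_pow_out_ = nullptr;
  T* beta2_pow_out_ = nullptr;
  T beta1_{}, beta2_{};
};

// Specialization for NeedUpdate == false: both members are no-ops, so the
// parameter-update functor pays nothing when beta-pow is maintained elsewhere
// (for example on the CPU while the element-wise work runs on the GPU).
template <typename T>
struct BetaPowUpdater<T, false> {
  void Set(const T*, const T*, T*, T*, T, T) {}
  void UpdateBetaPow(std::size_t) const {}
};

template <typename T, bool NeedUpdate>
struct ParamUpdater : public BetaPowUpdater<T, NeedUpdate> {
  void operator()(std::size_t i, T* param, const T* step) const {
    param[i] -= step[i];      // the usual per-element work
    this->UpdateBetaPow(i);   // compiles away when NeedUpdate is false
  }
};

int main() {
  float p[2] = {1.f, 2.f}, step[2] = {0.1f, 0.2f};
  float b1p = 0.9f, b2p = 0.99f, b1p_out = 0.f, b2p_out = 0.f;
  ParamUpdater<float, true> update;
  update.Set(&b1p, &b2p, &b1p_out, &b2p_out, 0.9f, 0.999f);
  for (std::size_t i = 0; i < 2; ++i) update(i, p, step);
  std::printf("p = {%f, %f}  beta1_pow_out = %f  beta2_pow_out = %f\n",
              p[0], p[1], b1p_out, b2p_out);
  return 0;
}
```

The kernel then maps its runtime flag (`should_update_beta_pow_later`) onto the compile-time bool with an if/else that instantiates both variants, which is what the `CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC` macro further down in this patch does.
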
@@ -491,9 +492,9 @@ class LambOpKernel : public framework::OpKernel { auto trust_ratio_div = ctx.AllocateTmpTensor(param.dims(), dev_ctx); - const void* param_ptr = param.template data(); + const void* param_ptr = param.data(); const void* master_param_ptr = - master_param ? master_param->template data() : nullptr; + master_param ? master_param->data() : nullptr; void* param_out_ptr = param_out.template mutable_data(ctx.GetPlace()); void* master_param_out_ptr = master_param_out @@ -501,6 +502,11 @@ class LambOpKernel : public framework::OpKernel { : nullptr; // Update moments + bool should_update_beta_pow_later = false; + const MT *beta1_pow_ptr = nullptr, *beta2_pow_ptr = nullptr; + MT *beta1_pow_out_ptr = nullptr, *beta2_pow_out_ptr = nullptr; + VLOG(10) << "Beta1Pow place: " << beta1_pow.place() + << " , Beta2Pow place: " << beta2_pow.place(); if (grad_var->IsType()) { auto& grad = grad_var->Get(); if (platform::is_gpu_place(ctx.GetPlace()) && @@ -508,8 +514,7 @@ class LambOpKernel : public framework::OpKernel { beta2_pow.place() == platform::CPUPlace()) { LambMomentREGUpdateFunctor moment_update_functor( weight_decay, beta1, beta2, epsilon, *beta1_pow.template data(), - nullptr, *beta2_pow.template data(), nullptr, - mom1.template data(), + *beta2_pow.template data(), mom1.template data(), mom1_out.template mutable_data(ctx.GetPlace()), mom2.template data(), mom2_out.template mutable_data(ctx.GetPlace()), @@ -523,12 +528,17 @@ class LambOpKernel : public framework::OpKernel { beta2_pow_out.template mutable_data(platform::CPUPlace())[0] = beta2 * beta2_pow.template data()[0]; } else { + beta1_pow_ptr = beta1_pow.template data(); + beta2_pow_ptr = beta2_pow.template data(); + beta1_pow_out_ptr = + beta1_pow_out.template mutable_data(ctx.GetPlace()); + beta2_pow_out_ptr = + beta2_pow_out.template mutable_data(ctx.GetPlace()); + should_update_beta_pow_later = true; LambMomentMENUpdateFunctor moment_update_functor( - weight_decay, beta1, beta2, epsilon, beta1_pow.template data(), - beta1_pow_out.template mutable_data(ctx.GetPlace()), - beta2_pow.template data(), - beta2_pow_out.template mutable_data(ctx.GetPlace()), - mom1.template data(), + weight_decay, beta1, beta2, epsilon, + static_cast(beta1_pow_ptr), + static_cast(beta2_pow_ptr), mom1.template data(), mom1_out.template mutable_data(ctx.GetPlace()), mom2.template data(), mom2_out.template mutable_data(ctx.GetPlace()), @@ -542,7 +552,12 @@ class LambOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(IsMultiPrecision, false, platform::errors::Unimplemented( "SelectedRows gradient is not supported when " - "multi_precision=True")); + "multi_precision=True.")); + constexpr bool kIsSameType = std::is_same::value; + PADDLE_ENFORCE_EQ(kIsSameType, true, + platform::errors::Unimplemented( + "SelectedRows gradient is not supported when " + "multi_precision=True.")); auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", "Lamb"); if (grad.rows().size() == 0) { @@ -582,8 +597,8 @@ class LambOpKernel : public framework::OpKernel { SparseLambMomentREGUpdateFunctor moment_update_functor( static_cast(weight_decay), static_cast(beta1), static_cast(beta2), static_cast(epsilon), - *beta1_pow.template data(), nullptr, - *beta2_pow.template data(), nullptr, mom1.template data(), + *beta1_pow.template data(), *beta2_pow.template data(), + mom1.template data(), mom1_out.template mutable_data(ctx.GetPlace()), mom2.template data(), mom2_out.template mutable_data(ctx.GetPlace()), grad_data, @@ -595,14 +610,18 @@ class LambOpKernel : public 
framework::OpKernel { beta2_pow_out.template mutable_data(platform::CPUPlace())[0] = static_cast(beta2) * beta2_pow.template data()[0]; } else { + beta1_pow_ptr = beta1_pow.template data(); + beta2_pow_ptr = beta2_pow.template data(); + beta1_pow_out_ptr = + beta1_pow_out.template mutable_data(ctx.GetPlace()); + beta2_pow_out_ptr = + beta2_pow_out.template mutable_data(ctx.GetPlace()); + should_update_beta_pow_later = true; SparseLambMomentMENUpdateFunctor moment_update_functor( static_cast(weight_decay), static_cast(beta1), static_cast(beta2), static_cast(epsilon), - beta1_pow.template data(), - beta1_pow_out.template mutable_data(ctx.GetPlace()), - beta2_pow.template data(), - beta2_pow_out.template mutable_data(ctx.GetPlace()), - mom1.template data(), + reinterpret_cast(beta1_pow_ptr), + reinterpret_cast(beta2_pow_ptr), mom1.template data(), mom1_out.template mutable_data(ctx.GetPlace()), mom2.template data(), mom2_out.template mutable_data(ctx.GetPlace()), grad_data, @@ -639,14 +658,31 @@ class LambOpKernel : public framework::OpKernel { } trust_ratio_div_norm.device(*place) = t.square().sum().sqrt(); - LambParamUpateFunctor param_update_functor( - lr.template data(), static_cast(param_ptr), - static_cast(master_param_ptr), p_norm_t.template data(), - trust_ratio_div.template data(), - trust_ratio_div_norm_t.template data(), - static_cast(param_out_ptr), static_cast(master_param_out_ptr), - skip_update_flag); - for_range(param_update_functor); +#define CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(__should_update_beta_pow) \ + do { \ + LambParamUpateFunctor \ + param_update_functor( \ + lr.template data(), static_cast(param_ptr), \ + static_cast(master_param_ptr), \ + p_norm_t.template data(), trust_ratio_div.template data(), \ + trust_ratio_div_norm_t.template data(), \ + static_cast(param_out_ptr), \ + static_cast(master_param_out_ptr), skip_update_flag); \ + if (__should_update_beta_pow) { \ + param_update_functor.SetBetaPows(beta1_pow_ptr, beta2_pow_ptr, \ + beta1_pow_out_ptr, beta2_pow_out_ptr, \ + beta1, beta2); \ + } \ + for_range(param_update_functor); \ + } while (0) + + if (should_update_beta_pow_later) { + CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(true); + } else { + CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC(false); + } + +#undef CALL_PADDLE_UPDATE_LAMB_PARAM_FUNC } }; diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cc b/paddle/fluid/operators/optimizers/merged_adam_op.cc new file mode 100644 index 0000000000000..11c047305c44a --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/optimizers/merged_adam_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class MergedAdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto param_dtype = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(param_dtype, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || + var_name == "SkipUpdate") { + return expected_kernel_type; + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + +class MergedAdamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Param", "(Tensor, default Tensor) Input parameter") + .AsDuplicable(); + AddInput("Grad", "(Tensor, default Tensor) Input gradient") + .AsDuplicable(); + AddInput("LearningRate", "(Tensor, default Tensor) Learning rate") + .AsDuplicable(); + AddInput("Moment1", "(Tensor, default Tensor) Input first moment") + .AsDuplicable(); + AddInput("Moment2", "(Tensor, default Tensor) Input second moment") + .AsDuplicable(); + AddInput("Beta1Pow", + "(Tensor, default Tensor) Input beta1 power accumulator") + .AsDuplicable(); + AddInput("Beta2Pow", + "(Tensor, default Tensor) Input beta2 power accumulator") + .AsDuplicable(); + AddInput("MasterParam", "FP32 master weight for AMP.") + .AsDispensable() + .AsDuplicable(); + + AddOutput("ParamOut", "(Tensor) Output parameter").AsDuplicable(); + AddOutput("Moment1Out", "(Tensor) Output first moment").AsDuplicable(); + AddOutput("Moment2Out", "(Tensor) Output second moment").AsDuplicable(); + AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator") + .AsDuplicable(); + AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator") + .AsDuplicable(); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .AsDispensable() + .AsDuplicable(); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "first moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the " + "second moment estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut + // as dispensable since they are not used when use_global_beta_pow is true. + AddAttr("use_global_beta_pow", + "(bool, default false) " + "Whether to use global beta_pow for whole model instead of " + "creating beta_pow for each parameter.") + .SetDefault(false); + + AddComment(R"DOC( +Adam Optimizer. +This implements the Adam optimizer from Section 2 of the Adam +paper : https://arxiv.org/abs/1412.6980. +Adam is a first-order gradient-based optimization method based on +adaptive estimates of lower-order moments. 
+Adam updates: +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(merged_adam, ops::MergedAdamOp, + ops::MergedAdamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(merged_adamw, ops::MergedAdamOp, + ops::MergedAdamOpMaker); + +REGISTER_OP_CPU_KERNEL( + merged_adam, + ops::MergedAdamOpKernel, + ops::MergedAdamOpKernel); diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cu b/paddle/fluid/operators/optimizers/merged_adam_op.cu new file mode 100644 index 0000000000000..2523fb9e5c680 --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cu @@ -0,0 +1,191 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/optimizers/merged_adam_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void AdamKernelREG(MT beta1, MT beta2, MT epsilon, MT beta1_pow_, + MT beta2_pow_, const MT* moment1, MT* moment1_out, + const MT* moment2, MT* moment2_out, const MT* lr_, + const T* grad, const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = beta1_pow_; + MT beta2_pow = beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void AdamKernelMEM(MT beta1, MT beta2, MT epsilon, + const MT* beta1_pow_, const MT* beta2_pow_, + const MT* moment1, MT* moment1_out, + const MT* moment2, MT* moment2_out, const MT* lr_, + const T* grad, const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? 
master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void UpdateBetaPow(T beta1, T beta2, const T* beta1_pow_, + const T* beta2_pow_, T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +class MergedAdamOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using MPDType = typename details::MPTypeTrait::Type; + + auto param = ctx.MultiInput("Param"); + auto grad = ctx.MultiInput("Grad"); + auto lr = ctx.MultiInput("LearningRate"); + auto mom1 = ctx.MultiInput("Moment1"); + auto mom2 = ctx.MultiInput("Moment2"); + auto beta1_pow = ctx.MultiInput("Beta1Pow"); + auto beta2_pow = ctx.MultiInput("Beta2Pow"); + + auto param_out = ctx.MultiOutput("ParamOut"); + auto mom1_out = ctx.MultiOutput("Moment1Out"); + auto mom2_out = ctx.MultiOutput("Moment2Out"); + auto beta1_pow_out = ctx.MultiOutput("Beta1PowOut"); + auto beta2_pow_out = ctx.MultiOutput("Beta2PowOut"); + + MPDType beta1 = static_cast(ctx.Attr("beta1")); + MPDType beta2 = static_cast(ctx.Attr("beta2")); + MPDType epsilon = static_cast(ctx.Attr("epsilon")); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + const bool multi_precision = ctx.Attr("multi_precision"); + auto master_param = ctx.MultiInput("MasterParam"); + auto master_param_out = + ctx.MultiOutput("MasterParamOut"); + + auto& dev_ctx = ctx.template device_context(); + + size_t param_num = param.size(); + for (size_t idx = 0; idx < param_num; idx++) { + const MPDType* master_in_data = + multi_precision ? master_param[idx]->data() : nullptr; + MPDType* master_out_data = + multi_precision + ? 
master_param_out[idx]->mutable_data(ctx.GetPlace()) + : nullptr; + + // update param and moment + int threads = 512; + int blocks = (param[idx]->numel() + threads - 1) / threads; + + if (beta1_pow[idx]->place() == platform::CPUPlace() && + beta2_pow[idx]->place() == platform::CPUPlace()) { + // Compute with betapow in REG + AdamKernelREG<<>>( + beta1, beta2, epsilon, *beta1_pow[idx]->data(), + *beta2_pow[idx]->data(), mom1[idx]->data(), + mom1_out[idx]->mutable_data(ctx.GetPlace()), + mom2[idx]->data(), + mom2_out[idx]->mutable_data(ctx.GetPlace()), + lr[idx]->data(), grad[idx]->data(), + param[idx]->data(), + param_out[idx]->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param[idx]->numel()); + if (!use_global_beta_pow) { + // Cpu update + beta1_pow_out[idx]->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow[idx]->data()[0]; + beta2_pow_out[idx]->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow[idx]->data()[0]; + } + } else { + AdamKernelMEM<<>>( + beta1, beta2, epsilon, beta1_pow[idx]->data(), + beta2_pow[idx]->data(), mom1[idx]->data(), + mom1_out[idx]->mutable_data(ctx.GetPlace()), + mom2[idx]->data(), + mom2_out[idx]->mutable_data(ctx.GetPlace()), + lr[idx]->data(), grad[idx]->data(), + param[idx]->data(), + param_out[idx]->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param[idx]->numel()); + if (!use_global_beta_pow) { + // Update with gpu + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow[idx]->data(), + beta2_pow[idx]->data(), + beta1_pow_out[idx]->mutable_data(ctx.GetPlace()), + beta2_pow_out[idx]->mutable_data(ctx.GetPlace())); + } + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(merged_adam, ops::MergedAdamOpCUDAKernel, + ops::MergedAdamOpCUDAKernel, + ops::MergedAdamOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.h b/paddle/fluid/operators/optimizers/merged_adam_op.h new file mode 100644 index 0000000000000..c9417158fe772 --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_adam_op.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" + +namespace paddle { +namespace operators { + +namespace scatter = paddle::operators::math::scatter; + +template +class MergedAdamOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param = ctx.MultiInput("Param"); + size_t n = param.size(); + auto grad = ctx.MultiInput("Grad"); + PADDLE_ENFORCE_EQ(n, grad.size(), + platform::errors::InvalidArgument( + "The size of Input(Grad) must be equal to " + "Input(Param), but got the size of Input(Grad) " + "is %d, the size of Input(Param) is %d.", + grad.size(), n)); + auto lr = ctx.MultiInput("LearningRate"); + PADDLE_ENFORCE_EQ( + n, lr.size(), + platform::errors::InvalidArgument( + "The size of Input(LearningRate) must be equal to " + "Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lr.size(), n)); + auto mom1 = ctx.MultiInput("Moment1"); + PADDLE_ENFORCE_EQ(n, mom1.size(), + platform::errors::InvalidArgument( + "The size of Input(Moment1) must be equal to " + "Input(Param), but got the size of Input(Moment1) " + "is %d, the size of Input(Param) is %d.", + mom1.size(), n)); + auto mom2 = ctx.MultiInput("Moment2"); + PADDLE_ENFORCE_EQ(n, mom2.size(), + platform::errors::InvalidArgument( + "The size of Input(Moment2) must be equal to " + "Input(Param), but got the size of Input(Moment2) " + "is %d, the size of Input(Param) is %d.", + mom2.size(), n)); + auto beta1_pow = ctx.MultiInput("Beta1Pow"); + PADDLE_ENFORCE_EQ(n, beta1_pow.size(), + platform::errors::InvalidArgument( + "The size of Input(Beta1Pow) must be equal to " + "Input(Param), but got the size of Input(Beta1Pow) " + "is %d, the size of Input(Param) is %d.", + beta1_pow.size(), n)); + auto beta2_pow = ctx.MultiInput("Beta2Pow"); + PADDLE_ENFORCE_EQ(n, beta2_pow.size(), + platform::errors::InvalidArgument( + "The size of Input(Beta2Pow) must be equal to " + "Input(Param), but got the size of Input(Beta2Pow) " + "is %d, the size of Input(Param) is %d.", + beta2_pow.size(), n)); + + auto param_out = ctx.MultiOutput("ParamOut"); + auto mom1_out = ctx.MultiOutput("Moment1Out"); + auto mom2_out = ctx.MultiOutput("Moment2Out"); + auto beta1_pow_out = ctx.MultiOutput("Beta1PowOut"); + auto beta2_pow_out = ctx.MultiOutput("Beta2PowOut"); + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + size_t param_num = param.size(); + for (size_t idx = 0; idx < param_num; idx++) { + AdamFunctor functor( + beta1, beta2, epsilon, beta1_pow[idx]->data(), + beta2_pow[idx]->data(), mom1[idx]->data(), + mom1_out[idx]->mutable_data(ctx.GetPlace()), mom2[idx]->data(), + mom2_out[idx]->mutable_data(ctx.GetPlace()), lr[idx]->data(), + grad[idx]->data(), param[idx]->data(), + param_out[idx]->mutable_data(ctx.GetPlace())); + functor(param[idx]->numel()); + if (!use_global_beta_pow) { + beta1_pow_out[idx]->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow[idx]->data()[0]; + beta2_pow_out[idx]->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow[idx]->data()[0]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 
e7c09430e9157..28f73e0618c2a 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -126,13 +126,24 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Param", "(Tensor or SelectedRows) Input parameter"); AddInput("LearningRate", "(Tensor) Learning rate of SGD"); AddInput("Grad", "(Tensor or SelectedRows) Input gradient"); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor or SelectedRows, same with Param) " "Output parameter, should share the same memory with Param"); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .AsDispensable(); + AddAttr( "use_mkldnn", "(bool, default false) Indicates if MKL-DNN kernel will be used") .SetDefault(false); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + AddComment(R"DOC( SGD operator diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 3582e939f30ac..5e3ae6c017bca 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/sgd_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -21,14 +22,19 @@ namespace operators { namespace { -template -__global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, - const int num, T* p_out) { - T lr = learning_rate[0]; +template +__global__ void SGDKernelMT(const T* param, const T* grad, + const T* learning_rate, const int num, T* param_out, + const MT* master_param, MT* master_param_out) { + MT lr = static_cast(learning_rate[0]); CUDA_KERNEL_LOOP(i, num) { - T g_data = g[i]; - T p_data = p[i]; - p_out[i] = p_data - lr * g_data; + MT p_data = master_param ? master_param[i] : static_cast(param[i]); + MT g_data = static_cast(grad[i]); + p_data = p_data - lr * g_data; + param_out[i] = static_cast(p_data); + if (master_param_out) { + master_param_out[i] = p_data; + } } } @@ -63,30 +69,48 @@ class SGDOpKernel "but the received is %s", ctx.InputNames("Param").front(), paddle::framework::ToTypeName(param_var->Type()))); + using paddle::framework::Tensor; + using MPDType = typename details::MPTypeTrait::Type; auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); auto* learning_rate = ctx.Input("LearningRate"); auto* grad_var = ctx.InputVar("Grad"); + + const bool multi_precision = ctx.Attr("multi_precision"); + const Tensor* master_param = nullptr; + Tensor* master_param_out = nullptr; + if (multi_precision) { + bool has_master = + ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); + PADDLE_ENFORCE_EQ(has_master, true, + platform::errors::InvalidArgument( + "The Input(MasterParam) and Output(MasterParamOut) " + "should not be null when " + "the attr `multi_precision` is true")); + master_param = ctx.Input("MasterParam"); + master_param_out = ctx.Output("MasterParamOut"); + } + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision + ? master_param_out->mutable_data(ctx.GetPlace()) + : nullptr; + // Actually, all tensors are LoDTensor except SelectedRows. 
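
The hunk above replaces the plain `SGDKernel` with `SGDKernelMT`, which optionally carries an FP32 master copy of the parameters: the update is computed in the master precision and cast back to the parameter dtype, while the master copy is also written out so small updates are not lost to FP16 rounding over many steps. A minimal host-side sketch of that update rule follows; it uses a float/double pair to stand in for fp16/fp32 and is illustrative only, not the Paddle kernel:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// One multi-precision SGD step: low-precision parameters (T) are shadowed by
// a high-precision master copy (MT); arithmetic runs in MT and both copies
// are written back.
template <typename T, typename MT>
void SgdStepMultiPrecision(const T* grad, MT lr, std::size_t n, T* param,
                           MT* master_param) {
  for (std::size_t i = 0; i < n; ++i) {
    MT p = master_param ? master_param[i] : static_cast<MT>(param[i]);
    MT g = static_cast<MT>(grad[i]);
    p -= lr * g;
    param[i] = static_cast<T>(p);            // copy used by the model
    if (master_param) master_param[i] = p;   // copy kept by the optimizer
  }
}

int main() {
  // float/double plays the role of the fp16/fp32 pair in the CUDA kernel.
  std::vector<float> param = {1.0f, 2.0f};
  std::vector<float> grad = {0.5f, -0.25f};
  std::vector<double> master(param.begin(), param.end());
  SgdStepMultiPrecision(grad.data(), 0.1, param.size(), param.data(),
                        master.data());
  std::printf("param  = {%f, %f}\nmaster = {%f, %f}\n", param[0], param[1],
              master[0], master[1]);
  return 0;
}
```
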
if (grad_var->IsType()) { - param_out->mutable_data(ctx.GetPlace()); auto* grad = ctx.Input("Grad"); - // LOG(ERROR) << "grad"; - // LOG(ERROR) << ctx.op().Input("Grad"); - auto* grad_data = grad->data(); - // LOG(ERROR) << "param"; - auto* param_data = param->data(); - // LOG(ERROR) << "fin"; - auto* param_out_data = param_out->data(); int block = 512; int grid = (param->numel() + block - 1) / block; - SGDKernel<<>>( - grad_data, param_data, learning_rate->data(), param->numel(), - param_out_data); + SGDKernelMT< + T, MPDType><<>>( + param->data(), grad->data(), learning_rate->data(), + param->numel(), param_out->mutable_data(ctx.GetPlace()), + master_in_data, master_out_data); } else if (grad_var->IsType()) { // TODO(qijun): In Sparse SGD operator, in-place update is enforced. diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc new file mode 100644 index 0000000000000..40a416dfda4ca --- /dev/null +++ b/paddle/fluid/operators/pad_op_npu.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/npu/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PadNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto paddings = context.Attr>("paddings"); + float pad_value = context.Attr("pad_value"); + + PADDLE_ENFORCE_LT(abs(pad_value), 1e-5, + platform::errors::Unimplemented( + "Ascend npu only support pad_value=0 right now," + "but received pad_value is %f .", + pad_value)); + + out->mutable_data(context.GetPlace()); + + NpuOpRunner runner; + runner.SetType("Pad") + .AddInput(*x) + .AddInput(std::move(paddings)) + .AddOutput(*out); + + auto stream = + context.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class PadGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + auto paddings = context.Attr>("paddings"); + + d_x->mutable_data(context.GetPlace()); + + auto d_x_dims = d_x->dims(); + auto size = paddle::framework::vectorize(d_x_dims); + std::vector offsets(0); + int i = 0; + for (auto iter = paddings.begin(); iter < paddings.end(); ++iter, ++i) { + if (i % 2 == 0) { + offsets.push_back(*iter); + } + } + + auto stream = + context.template device_context() + .stream(); + + const auto& runner = NpuOpRunner("SliceD", {*d_out}, {*d_x}, + {{"offsets", offsets}, {"size", size}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + 
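
`PadGradNPUKernel` above expresses the pad gradient as a slice: d(X) is the interior region of d(Out), so the even-indexed entries of `paddings` (the "before" amount of each dimension) become the slice offsets and the shape of X becomes the slice size. A small sketch of that offset/size derivation, in plain C++ rather than the NPU runner, with illustrative names:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// paddings is stored flat as {before_0, after_0, before_1, after_1, ...}.
struct SliceSpec {
  std::vector<int> offsets;
  std::vector<int> sizes;
};

SliceSpec PadGradAsSlice(const std::vector<int>& paddings,
                         const std::vector<int>& x_dims) {
  SliceSpec spec;
  spec.sizes = x_dims;  // the slice recovers exactly the original extent
  for (std::size_t i = 0; i < paddings.size(); i += 2) {
    spec.offsets.push_back(paddings[i]);  // keep only the "before" paddings
  }
  return spec;
}

int main() {
  // X is 2x3, padded by 1 row above/below and 2 columns left/right.
  SliceSpec s = PadGradAsSlice(/*paddings=*/{1, 1, 2, 2}, /*x_dims=*/{2, 3});
  std::printf("offsets = {%d, %d}, sizes = {%d, %d}\n", s.offsets[0],
              s.offsets[1], s.sizes[0], s.sizes[1]);
  return 0;
}
```
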
+REGISTER_OP_NPU_KERNEL(pad, ops::PadNPUKernel, + ops::PadNPUKernel, ops::PadNPUKernel); + +REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadNPUKernel, + ops::PadGradNPUKernel); diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc new file mode 100644 index 0000000000000..cc4b6e5e0756a --- /dev/null +++ b/paddle/fluid/operators/poisson_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/poisson_op.h" + +namespace paddle { +namespace operators { + +class PoissonOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PoissonOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "PoissonOp"); + + auto dim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class PoissonOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input tensor of poisson op"); + AddOutput("Out", + "The output tensor of poisson op, it has the same shape and " + "dtype with input. Each element corresponds to input tensor"); + AddComment(R"DOC( +This operator generate random value that obey poisson distribution. 
+)DOC"); + } +}; + +class PoissonOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map &GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Out"}}; + return m; + } +}; + +template +class PoissonKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + auto *out = ctx.Output("Out"); + + const T *x_data = x->data(); + T *out_data = out->mutable_data(ctx.GetPlace()); + + int64_t size = x->numel(); + + auto gen = framework::DefaultCPUGenerator(); + auto engine = gen->GetCPUEngine(); + + for (int64_t i = 0; i < size; ++i) { + std::poisson_distribution<> dist(x_data[i]); + out_data[i] = static_cast(dist(*engine)); + } + } +}; + +class PoissonGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out_Grad", "PoissonGradOp"); + + auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dim); + } +}; + +template +class PoissonGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("poisson_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, + ops::PoissonOpInferVarType, + ops::PoissonGradOpMaker, + ops::PoissonGradOpMaker); + +REGISTER_OPERATOR(poisson_grad, ops::PoissonGradOp); + +REGISTER_OP_CPU_KERNEL(poisson, + ops::PoissonKernel, + ops::PoissonKernel); + +REGISTER_OP_CPU_KERNEL(poisson_grad, + ops::PoissonGradKernel, + ops::PoissonGradKernel); diff --git a/paddle/fluid/operators/poisson_op.cu b/paddle/fluid/operators/poisson_op.cu new file mode 100644 index 0000000000000..3f18eb994e145 --- /dev/null +++ b/paddle/fluid/operators/poisson_op.cu @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +#endif +#include "paddle/fluid/operators/poisson_op.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct PoissonCudaFunctor { + public: + PoissonCudaFunctor(const T* in, T* out, unsigned int seed, + unsigned int offset) + : in_(in), out_(out), seed_(seed), offset_(offset) {} + + __device__ void operator()(int64_t idx) { +#ifdef __NVCC__ + curandStatePhilox4_32_10_t state; + curand_init(seed_, idx, offset_, &state); + out_[idx] = static_cast(curand_poisson(&state, in_[idx])); +#elif __HIPCC__ + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed_, idx, offset_, &state); + out_[idx] = static_cast(hiprand_poisson(&state, in_[idx])); +#endif + } + + private: + const T* in_; + T* out_; + const unsigned int seed_; + const unsigned int offset_; +}; + +template +class PoissonKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + const T* x_data = x->data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + auto size = x->numel(); + int64_t device_id = + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto seed_offset = gen_cuda->IncrementOffset(20); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + auto& dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, size); + + PoissonCudaFunctor functor(x_data, out_data, seed, offset); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(poisson, + ops::PoissonKernel, + ops::PoissonKernel); + +REGISTER_OP_CUDA_KERNEL( + poisson_grad, ops::PoissonGradKernel, + ops::PoissonGradKernel); diff --git a/paddle/fluid/operators/poisson_op.h b/paddle/fluid/operators/poisson_op.h new file mode 100644 index 0000000000000..2159637b290c9 --- /dev/null +++ b/paddle/fluid/operators/poisson_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
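
The CPU path in poisson_op.cc above draws each output element independently from `std::poisson_distribution` with the corresponding element of X as the rate, while the CUDA functor does the equivalent with `curand_poisson`/`hiprand_poisson` and a per-thread Philox state derived from a shared seed and offset. A minimal standalone version of the CPU sampling loop, using a plain `std::mt19937_64` instead of Paddle's default CPU generator (names here are illustrative):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

// Sample out[i] ~ Poisson(rates[i]) element-wise; one distribution object per
// element so each rate can differ, mirroring the kernel's per-element loop.
std::vector<int> SamplePoisson(const std::vector<double>& rates,
                               std::uint64_t seed) {
  std::mt19937_64 engine(seed);
  std::vector<int> out(rates.size());
  for (std::size_t i = 0; i < rates.size(); ++i) {
    std::poisson_distribution<int> dist(rates[i]);
    out[i] = dist(engine);
  }
  return out;
}

int main() {
  std::vector<int> samples = SamplePoisson({0.5, 3.0, 10.0}, /*seed=*/42);
  std::printf("%d %d %d\n", samples[0], samples[1], samples[2]);
  return 0;
}
```

The backward pass (poisson_op.h below) simply fills d(X) with zeros via `SetConstant`, since the sampled output is treated as non-differentiable with respect to the rate.
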
+ +#pragma once + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class PoissonKernel; + +template +class PoissonGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + math::SetConstant functor; + auto& dev_ctx = ctx.template device_context(); + functor(dev_ctx, dx, static_cast(0)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cc b/paddle/fluid/operators/pull_gpups_sparse_op.cc new file mode 100644 index 0000000000000..cae3109ea77af --- /dev/null +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/pull_gpups_sparse_op.h" + +namespace paddle { +namespace operators { + +class PullGpuPSSparseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_GE( + ctx->Inputs("Ids").size(), 1UL, + platform::errors::InvalidArgument( + "Inputs(Ids) of PullGpuPSSparseOp should not be empty.")); + PADDLE_ENFORCE_GE( + ctx->Outputs("Out").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(Out) of PullGpuPSSparseOp should not be empty.")); + auto embedding_size_vec = ctx->Attrs().Get>("size"); + PADDLE_ENFORCE_EQ( + ctx->Inputs("Ids").size(), embedding_size_vec.size(), + platform::errors::InvalidArgument("The ids size: %lu must be equal to " + "the length of embedding size: %lu.", + ctx->Inputs("Ids").size(), + embedding_size_vec.size())); + auto all_ids_dim = ctx->GetInputsDim("Ids"); + const size_t n_ids = all_ids_dim.size(); + std::vector outs_dims; + outs_dims.resize(n_ids); + for (size_t i = 0; i < n_ids; ++i) { + int embedding_size = embedding_size_vec[i]; + const auto ids_dims = all_ids_dim[i]; + int ids_rank = ids_dims.size(); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + platform::errors::InvalidArgument( + "Shape error in %lu id, the last dimension of the " + "'Ids' tensor must be 1.", + i)); + auto out_dim = framework::vectorize( + framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + out_dim.push_back(embedding_size); + outs_dims[i] = framework::make_ddim(out_dim); + } + ctx->SetOutputsDim("Out", outs_dims); + for (size_t i = 0; i < n_ids; ++i) { + ctx->ShareLoD("Ids", "Out", i, i); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.device_context()); + } +}; + +class PullGpuPSSparseOpMaker 
: public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter.") + .AsDispensable(); + AddInput("Ids", + "Input tensors with type int32 or int64 " + "contains the ids to be looked up in GpuPS. " + "The last dimension size must be 1.") + .AsDuplicable(); + AddOutput("Out", "The lookup results tensors.").AsDuplicable(); + AddAttr>( + "size", "(vector, the embedding size of corresponding slot") + .SetDefault(std::vector()); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); + AddAttr("is_distributed", + "(boolean, default false) distributed lookup table.") + .SetDefault(false); + AddComment(R"DOC( +Pull GpuPS Sparse Operator. + +This operator is used to perform lookups on the GpuPS, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + +)DOC"); + } +}; + +template +class PushGpuPSSparseOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("push_gpups_sparse"); + op->SetInput("Ids", this->Input("Ids")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetAttrMap(this->Attrs()); + } +}; + +class PushGpuPSSparseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(pull_gpups_sparse, ops::PullGpuPSSparseOp, + ops::PullGpuPSSparseOpMaker, + ops::PushGpuPSSparseOpMaker, + ops::PushGpuPSSparseOpMaker); +REGISTER_OPERATOR(push_gpups_sparse, ops::PushGpuPSSparseOp); +REGISTER_OP_CPU_KERNEL(pull_gpups_sparse, ops::PullGpuPSSparseCPUKernel, + ops::PullGpuPSSparseCPUKernel) +REGISTER_OP_CPU_KERNEL(push_gpups_sparse, ops::PushGpuPSSparseCPUKernel, + ops::PushGpuPSSparseCPUKernel) \ No newline at end of file diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cu b/paddle/fluid/operators/pull_gpups_sparse_op.cu new file mode 100644 index 0000000000000..a6bca37f5c863 --- /dev/null +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cu @@ -0,0 +1,48 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
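
The `InferShape` of `PullGpuPSSparseOp` above builds one output per Ids slot: it requires the last dimension of each Ids tensor to be 1, drops it, and appends that slot's embedding size from the `size` attribute. A small sketch of that per-slot shape rule, using plain vectors instead of Paddle's ddim types (illustrative only):

```cpp
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Out[i] has the shape of Ids[i] with its trailing 1 replaced by the
// embedding size configured for slot i.
std::vector<std::int64_t> LookupOutShape(const std::vector<std::int64_t>& ids_dims,
                                         int embedding_size) {
  if (ids_dims.empty() || ids_dims.back() != 1) {
    throw std::invalid_argument("last dimension of Ids must be 1");
  }
  std::vector<std::int64_t> out(ids_dims.begin(), ids_dims.end() - 1);
  out.push_back(embedding_size);
  return out;
}

int main() {
  // Ids of shape [batch=8, 1] with embedding size 16 -> Out is [8, 16].
  std::vector<std::int64_t> out = LookupOutShape({8, 1}, 16);
  std::printf("out dims = [%lld, %lld]\n", static_cast<long long>(out[0]),
              static_cast<long long>(out[1]));
  return 0;
}
```
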
+ +#include "paddle/fluid/operators/pull_gpups_sparse_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; +using LoDTensor = framework::LoDTensor; + +template +class PullGpuPSSparseCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PullGpuPSSparseFunctor(ctx); + } +}; + +template +class PushGpuPSSparseCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PushGpuPSSparseFunctor(ctx); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(pull_gpups_sparse, + ops::PullGpuPSSparseCUDAKernel, + ops::PullGpuPSSparseCUDAKernel) +REGISTER_OP_CUDA_KERNEL(push_gpups_sparse, + ops::PushGpuPSSparseCUDAKernel, + ops::PushGpuPSSparseCUDAKernel) \ No newline at end of file diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h new file mode 100644 index 0000000000000..f721608cffb08 --- /dev/null +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -0,0 +1,104 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +template +static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { + auto inputs = ctx.MultiInput("Ids"); + auto outputs = ctx.MultiOutput("Out"); + const auto slot_size = inputs.size(); + std::vector all_keys(slot_size); + // GpuPSPS only supports float now + std::vector all_values(slot_size); + std::vector slot_lengths(slot_size); + for (size_t i = 0; i < slot_size; i++) { + const auto *slot = inputs[i]; + const uint64_t *single_slot_keys = + reinterpret_cast(slot->data()); + all_keys[i] = single_slot_keys; + slot_lengths[i] = slot->numel(); + auto *output = outputs[i]->mutable_data(ctx.GetPlace()); + // double type is not fully supported now + all_values[i] = reinterpret_cast(output); + } +#ifdef PADDLE_WITH_HETERPS + auto gpu_ps_ptr = paddle::framework::PSGPUWrapper::GetInstance(); + gpu_ps_ptr->PullSparse(ctx.GetPlace(), 0, all_keys, all_values, slot_lengths, + 0); +#endif +} + +template +static void PushGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { + auto inputs = ctx.MultiInput("Ids"); + auto d_output = + ctx.MultiInput(framework::GradVarName("Out")); + const auto slot_size = inputs.size(); + std::vector all_keys(slot_size); + std::vector all_grad_values(slot_size); + std::vector slot_lengths(slot_size); + int batch_size = -1; + for (size_t i = 0; i < slot_size; i++) { + const auto *slot = inputs[i]; + const uint64_t *single_slot_keys = + reinterpret_cast(slot->data()); + all_keys[i] = single_slot_keys; + slot_lengths[i] = slot->numel(); + int cur_batch_size = + slot->lod().size() ? slot->lod()[0].size() - 1 : slot->dims()[0]; + if (batch_size == -1) { + batch_size = cur_batch_size; + } else { + PADDLE_ENFORCE_EQ(batch_size, cur_batch_size, + platform::errors::PreconditionNotMet( + "The batch size of all input slots should be same, " + "please cheack")); + } + const float *grad_value = d_output[i]->data(); + all_grad_values[i] = grad_value; + } +#ifdef PADDLE_WITH_HETERPS + auto gpu_ps_ptr = paddle::framework::PSGPUWrapper::GetInstance(); + gpu_ps_ptr->PushSparseGrad(ctx.GetPlace(), 0, all_keys, all_grad_values, + slot_lengths, 0, batch_size); +#endif +} + +using LoDTensor = framework::LoDTensor; +template +class PullGpuPSSparseCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PullGpuPSSparseFunctor(ctx); + } +}; + +template +class PushGpuPSSparseCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PushGpuPSSparseFunctor(ctx); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc new file mode 100644 index 0000000000000..3557048672742 --- /dev/null +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/put_along_axis_op.h" +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { + +class PutAlongAxisOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "PutAlongAxis"); + OP_INOUT_CHECK(ctx->HasInput("Index"), "Input", "Index", "PutAlongAxis"); + OP_INOUT_CHECK(ctx->HasInput("Value"), "Input", "Value", "PutAlongAxis"); + OP_INOUT_CHECK(ctx->HasOutput("Result"), "Output", "Result", + "PutAlongAxis"); + + auto index_dim = ctx->GetInputDim("Index"); + + ctx->SetOutputDim("Result", index_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class PutAlongAxisOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", "The input tensor of PutAlongAxisOp"); + AddInput("Index", "The index tensor of PutAlongAxisOp"); + AddInput("Value", "The value tensor of PutAlongAxisOp"); + AddOutput("Result", "The result tensor of PutAlongAxisOp"); + AddAttr("Axis", "The axis that we do PutAlongAxis operation"); + AddAttr("Reduce", "The reduce operation for scatter") + .SetDefault("assign"); + AddComment(R"DOC( + PutAlongAxis Operator.) 
+ )DOC"); + } +}; + +class PutAlongAxisGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Input"), + ctx->GetInputDim("Input")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Result")), + ctx.device_context()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +template +class PutAlongAxisGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("put_along_axis_grad"); + op->SetInput("Index", this->Input("Index")); + op->SetInput("Input", this->Input("Input")); + + op->SetInput(framework::GradVarName("Result"), this->OutputGrad("Result")); + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); + op->SetOutput(framework::GradVarName("Value"), this->InputGrad("Value")); + op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(PutAlongAxisInplaceInferer, {"Input", "Result"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, + ops::PutAlongAxisGradOpMaker, + ops::PutAlongAxisGradOpMaker, + paddle::operators::PutAlongAxisInplaceInferer); + +REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); + +REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, + ops::PutAlongAxisOpKernel, + ops::PutAlongAxisOpKernel, + ops::PutAlongAxisOpKernel, + ops::PutAlongAxisOpKernel); + +REGISTER_OP_CPU_KERNEL(put_along_axis_grad, + ops::PutAlongAxisGradOpKernel, + ops::PutAlongAxisGradOpKernel, + ops::PutAlongAxisGradOpKernel, + ops::PutAlongAxisGradOpKernel, + ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu new file mode 100644 index 0000000000000..da36b564337da --- /dev/null +++ b/paddle/fluid/operators/put_along_axis_op.cu @@ -0,0 +1,134 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <memory> +#include <vector> +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/put_along_axis_op.h" + +namespace paddle { +namespace operators { + +template <typename T> +class PutAlongAxisCUDAKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "PutAlongAxisCUDAKernel only runs on GPU device.")); + auto input = ctx.Input<Tensor>("Input"); + auto axis = ctx.Attr<int>("Axis"); + auto value = ctx.Input<Tensor>("Value"); + auto index = ctx.Input<Tensor>("Index"); + auto reduce_op = ctx.Attr<std::string>("Reduce"); + auto result = ctx.Output<Tensor>("Result"); + const platform::DeviceContext &device_ctx = ctx.device_context(); + + const auto &index_type = index->type(); + + framework::TensorCopy(*input, ctx.GetPlace(), result); + if (reduce_op == "add") { + if (index_type == framework::proto::VarType::INT32) { + gpu_scatter_add_kernel<T, int32_t>(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + gpu_scatter_add_kernel<T, int64_t>(*result, axis, *index, *value, + device_ctx); + } + } else if (reduce_op == "multiply" || reduce_op == "mul") { + if (index_type == framework::proto::VarType::INT32) { + gpu_scatter_mul_kernel<T, int32_t>(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + gpu_scatter_mul_kernel<T, int64_t>(*result, axis, *index, *value, + device_ctx); + } + } else if (reduce_op == "assign") { + if (index_type == framework::proto::VarType::INT32) { + gpu_scatter_assign_kernel<T, int32_t>(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + gpu_scatter_assign_kernel<T, int64_t>(*result, axis, *index, *value, + device_ctx); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "can not support reduce_op: '%s' for scatter kernel, only " + "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " + "default reduce op is 'assign' ", + reduce_op)); + return; + } + } +}; + +template <typename T> +class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); + + auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input")); + auto value_grad = ctx.Output<Tensor>(framework::GradVarName("Value")); + auto index = ctx.Input<Tensor>("Index"); + auto result_grad = ctx.Input<Tensor>(framework::GradVarName("Result")); + auto axis = ctx.Attr<int>("Axis"); + + const auto &index_type = index->type(); + if (input_grad) { + framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); + if (index_type == framework::proto::VarType::INT32) { + gpu_scatter_input_grad_kernel<T, int32_t>( + *result_grad, axis, *index, *input_grad, ctx.device_context()); + } else { + gpu_scatter_input_grad_kernel<T, int64_t>( + *result_grad, axis, *index, *input_grad, ctx.device_context()); + } + } + if (value_grad) { + value_grad->Resize(index->dims()); + value_grad->mutable_data<T>(ctx.GetPlace()); + if (index_type == framework::proto::VarType::INT32) { + gpu_gather_kernel<T, int32_t>( + *result_grad, axis, *index, *value_grad, + ctx.device_context()); // the gradient of scatter is gather + } else if (index_type == framework::proto::VarType::INT64) { + gpu_gather_kernel<T, int64_t>(*result_grad,
axis, *index, *value_grad, + ctx.device_context()); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, + ops::PutAlongAxisCUDAKernel, + ops::PutAlongAxisCUDAKernel, + ops::PutAlongAxisCUDAKernel, + ops::PutAlongAxisCUDAKernel); +REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, + ops::PutAlongAxisGradOpCUDAKernel, + ops::PutAlongAxisGradOpCUDAKernel, + ops::PutAlongAxisGradOpCUDAKernel, + ops::PutAlongAxisGradOpCUDAKernel, + ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h new file mode 100644 index 0000000000000..f23ca177db9c5 --- /dev/null +++ b/paddle/fluid/operators/put_along_axis_op.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PutAlongAxisOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "PutAlongAxisOpKernel only runs on CPU.")); + + auto input = ctx.Input("Input"); + auto axis = ctx.Attr("Axis"); + auto value = ctx.Input("Value"); + auto index = ctx.Input("Index"); + auto reduce_op = ctx.Attr("Reduce"); + auto result = ctx.Output("Result"); + + framework::TensorCopy(*input, ctx.GetPlace(), result); + const platform::DeviceContext &device_ctx = ctx.device_context(); + const auto &index_type = index->type(); + if (reduce_op == "add") { + if (index_type == framework::proto::VarType::INT32) { + cpu_scatter_add_kernel(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + cpu_scatter_add_kernel(*result, axis, *index, *value, + device_ctx); + } + } else if (reduce_op == "multiply" || reduce_op == "mul") { + if (index_type == framework::proto::VarType::INT32) { + cpu_scatter_mul_kernel(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + cpu_scatter_mul_kernel(*result, axis, *index, *value, + device_ctx); + } + } else if (reduce_op == "assign") { + if (index_type == framework::proto::VarType::INT32) { + cpu_scatter_assign_kernel(*result, axis, *index, *value, + device_ctx); + } else if (index_type == framework::proto::VarType::INT64) { + cpu_scatter_assign_kernel(*result, axis, *index, *value, + device_ctx); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "can not support reduce_op: '%s' for scatter kernel, only " + "support reduce op: 'add‘, 'assign', 'mul' 
and 'multiply', the " + "defalut reduce " + "op is 'assign' ", + reduce_op)); + return; + } + } +}; + +template +class PutAlongAxisGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "PutAlongAxisGradOpKernel only runs on CPU.")); + + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto value_grad = ctx.Output(framework::GradVarName("Value")); + auto index = ctx.Input("Index"); + auto result_grad = ctx.Input(framework::GradVarName("Result")); + auto axis = ctx.Attr("Axis"); + const auto &index_type = index->type(); + + if (input_grad) { + framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); + if (index_type == framework::proto::VarType::INT32) { + cpu_scatter_input_grad_kernel( + // Here passing an unused argument *result_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. + *result_grad, axis, *index, *input_grad, ctx.device_context()); + } else { + cpu_scatter_input_grad_kernel( + *result_grad, axis, *index, *input_grad, ctx.device_context()); + } + } + + if (value_grad) { + value_grad->Resize(index->dims()); + value_grad->mutable_data(ctx.GetPlace()); + if (index_type == framework::proto::VarType::INT32) { + cpu_gather_kernel(*result_grad, axis, *index, *value_grad, + ctx.device_context()); + } else if (index_type == framework::proto::VarType::INT64) { + cpu_gather_kernel(*result_grad, axis, *index, *value_grad, + ctx.device_context()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 6c28daa7eac72..3c0c8ad1cafce 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -132,7 +132,7 @@ void BufferedReader::ReadAsync(size_t i) { memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()), - cpu[i].data(), size); + cpu[i].data(), size); cuda[i].set_lod(cpu[i].lod()); } else { @@ -175,7 +175,7 @@ void BufferedReader::ReadAsync(size_t i) { platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); - auto cpu_ptr = cpu[i].data(); + auto cpu_ptr = cpu[i].data(); auto gpu_ptr = gpu_ptrs[i]; auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); @@ -239,7 +239,7 @@ void BufferedReader::ReadAsync(size_t i) { platform::RecordEvent record_event("BufferedReader:MemoryCopy"); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); - auto cpu_ptr = cpu[i].data(); + auto cpu_ptr = cpu[i].data(); auto npu_ptr = npu_ptrs[i]; auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); diff --git a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu index 4464b95712557..33e195f899209 100644 --- a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu +++ b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "gtest/gtest.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/pten/kernels/gpu/reduce.h" namespace paddle { namespace operators { @@ -39,9 +39,9 @@ TEST(test_reduce_rank_check, all) { } if (is_valid) { - CheckReduceRank(reduce_rank, rank); + pten::kernels::details::CheckReduceRank(reduce_rank, rank); } else { - ASSERT_THROW(CheckReduceRank(reduce_rank, rank), + ASSERT_THROW(pten::kernels::details::CheckReduceRank(reduce_rank, rank), paddle::platform::EnforceNotMet); } } diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc new file mode 100644 index 0000000000000..c5bc66e23ce8a --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_amax); +REGISTER_OP_CPU_KERNEL( + reduce_amax, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_amax_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu new file mode 100644 index 0000000000000..16c7a4794bb50 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu @@ -0,0 +1,23 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +// reduce_max +REGISTER_OP_CUDA_KERNEL( + reduce_amax, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu new file mode 100644 index 0000000000000..27f2e2b70c681 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_amax_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc new file mode 100644 index 0000000000000..027bf8ea00a9b --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +REGISTER_REDUCE_OP(reduce_amin); +REGISTER_OP_CPU_KERNEL( + reduce_amin, ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_amin_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu new file mode 100644 index 0000000000000..f9f015804e11d --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu @@ -0,0 +1,23 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +// reduce_min +REGISTER_OP_CUDA_KERNEL( + reduce_amin, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu new file mode 100644 index 0000000000000..a296c4c5d6fa1 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_amin_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h deleted file mode 100644 index 72d21d7074e88..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/hostdevice.h" -#ifdef __HIPCC__ -#include -#endif - -namespace paddle { -namespace operators { - -namespace kps = paddle::operators::kernel_primitives; - -template -struct CustomMin { - using Transformer = kps::IdentityFunctor; - - inline Ty initial() { - return static_cast(std::numeric_limits::max()); - } - - __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { - return (b < a) ? b : a; - } -}; - -template -struct CustomMax { - using Transformer = kps::IdentityFunctor; - - inline Ty initial() { - return static_cast(std::numeric_limits::lowest()); - } - - __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { - return (b > a) ? 
b : a; - } -}; - -// for cub::Reduce -template -struct CustomSum { - using Transformer = kps::IdentityFunctor; - - inline Ty initial() { return static_cast(0.0f); } - - __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { - return b + a; - } -}; - -template -struct CustomSub { - using Transformer = kps::InverseFunctor; - - inline Ty initial() { return static_cast(0.0f); } - - __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { - return b + a; - } -}; - -template -struct CustomMean { - using Transformer = kps::DivideFunctor; - - inline Ty initial() { return static_cast(0.0f); } - - __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { - return b + a; - } -}; - -template -struct CustomMul { - using Transformer = kps::IdentityFunctor; - - inline Ty initial() { return static_cast(1.0f); } - - __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { - return b * a; - } -}; - -template -struct CustomLogicalOr { - using Transformer = kps::IdentityFunctor; - - inline Ty initial() { return static_cast(false); } - - __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { - return b || a; - } -}; - -template -struct CustomLogicalAnd { - using Transformer = kps::IdentityFunctor; - - inline Ty initial() { return static_cast(true); } - - __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { - return b && a; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc index d6c1dc5f02d42..6f64a055d3471 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc @@ -14,11 +14,12 @@ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" #include #include #include +#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" + namespace paddle { namespace operators { template @@ -41,15 +42,24 @@ class ReduceMeanXPUKernel : public framework::OpKernel { xdims.push_back(input->dims()[i]); } auto rdims = context.Attr>("dim"); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; if (reduce_all) { - rdims.clear(); for (size_t i = 0; i < xdims.size(); i++) { - rdims.push_back(static_cast(i)); + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < rdims.size(); ++i) { + if (rdims[i] < 0) { + reduce_dims.push_back(rdims[i] + input_dim_size); + } else { + reduce_dims.push_back(rdims[i]); + } } } int r = xpu::reduce_mean( dev_ctx.x_context(), reinterpret_cast(input->data()), - reinterpret_cast(output->data()), xdims, rdims); + reinterpret_cast(output->data()), xdims, reduce_dims); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h b/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h index 2557e8dd48861..c36b4d7467658 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h @@ -46,5 +46,95 @@ struct MaxOrMinGradFunctor { } }; +#define HANDLE_AXIS_DIM(BROADCAST_DIM, AXIS_DIM) \ + if (broadcast_dim_size == BROADCAST_DIM && rank == AXIS_DIM) { \ + AMaxOrAMinAxisIsListGradFunctor( \ + place, x, y, dx, dy, dim, axis_dim); \ + } + +template +void AMaxOrAMinAxisIsListGradFunctor(const DeviceContext& place, X* x, Y* y, + DX* dx, DY* dy, const Dim& dim, + const std::vector& 
axis_dim) { + // R is x->dimensions().size(); + // D is axis_dim->dimensions().size(); + auto axis = Eigen::array(); + auto reshape_x = Eigen::array(); + auto reshape_y = Eigen::array(); + + for (int i = 0; i < D; i++) axis[i] = axis_dim[i]; + for (int i = 0; i < R; i++) { + reshape_x[i] = x->dimensions()[i]; + reshape_y[i] = y->dimensions()[i]; + } + + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + auto mask = equals.select(ones, zeros); + dx->device(place) = + dy->broadcast(dim) * mask / + mask.reshape(reshape_x).sum(axis).reshape(reshape_y).broadcast(dim); +} + +struct AMaxOrAMinGradFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, + const Dim& dim, int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + auto mask = equals.select(ones, zeros); + + // If there are multiple minimum or maximum elements, + // we evenly distribute gradient between these equal values + size_t x_numel = 1; + for (size_t i = 0; i < x->dimensions().size(); i++) + x_numel *= x->dimensions()[i]; + // reduce_all + if (size == static_cast(x_numel)) { + auto equal_number = mask.sum() + .reshape(Eigen::array({1})) + .broadcast(Eigen::array({size})); + dx->device(place) = dy->broadcast(dim) * mask / equal_number; + return; + } + + // compute forward reduce axis_dim by dim (which is broadcast_dim) + std::vector axis_dim; + int broadcast_dim_size = static_cast(dim.size()); + for (int i = 0; i < broadcast_dim_size; i++) { + if (dim[i] > 1) { + axis_dim.push_back(i); + } + } + + int rank = static_cast(axis_dim.size()); + // axis is a int element + if (rank == 1) { + auto axis = Eigen::array({axis_dim[0]}); + dx->device(place) = + dy->broadcast(dim) * mask / + mask.sum(axis).reshape(dy->dimensions()).broadcast(dim); + return; + } + // axis is list, HANDLE_AXIS_DIM(broadcast_dim_size, rank) + HANDLE_AXIS_DIM(3, 2); + HANDLE_AXIS_DIM(4, 2); + HANDLE_AXIS_DIM(4, 3); + // comments for accelerating compiling temporarily. 
+ // HANDLE_AXIS_DIM(5, 2); + // HANDLE_AXIS_DIM(5, 3); + // HANDLE_AXIS_DIM(5, 4); + // HANDLE_AXIS_DIM(6, 2); + // HANDLE_AXIS_DIM(6, 3); + // HANDLE_AXIS_DIM(6, 4); + // HANDLE_AXIS_DIM(6, 5); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 5a82176a9c980..62486f62f66f8 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -29,952 +29,28 @@ namespace cub = hipcub; #endif -#include "paddle/fluid/framework/array.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/cast_op.h" -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/fast_divmod.h" -#include "paddle/fluid/string/string_helper.h" -// Reduce split or not, Whether to use ReduceHigherDim -#define REDUCE_SPLIT_BOUNDARY 512 -#define REDUCE_VEC_SIZE 4 - -namespace kps = paddle::operators::kernel_primitives; +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/gpu/reduce.h" namespace paddle { namespace operators { -namespace details { - -static inline int GetLastPow2(int n) { - n |= (n >> 1); - n |= (n >> 2); - n |= (n >> 4); - n |= (n >> 8); - n |= (n >> 16); - return std::max(1, n - (n >> 1)); -} - -static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } - -// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny -static inline std::vector GetDimStrides(const std::vector& dims, - const std::vector& idx) { - int n = static_cast(idx.size()); - if (n == 0) return std::vector(); - std::vector strides(n); - strides.back() = 1; - for (int i = n - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * dims[idx[i + 1]]; - } - return strides; -} - -// get blockDim for reduceLastDim and reduceAny -static inline int GetBlockDim(int block_dim) { - return block_dim >= kps::details::kReduceMaxThread - ? kps::details::kReduceMaxThread - : GetLastPow2(block_dim); -} - -// check reduce rand is valid -static inline void CheckReduceRank(int reduce_rank, int rank) { - if (rank % 2 == 0) { - PADDLE_ENFORCE_EQ(reduce_rank, rank / 2, - platform::errors::InvalidArgument( - "ReduceOp: invalid reduce rank. When rank = %d, " - "reduce_rank must be %d, but got %d.", - rank, rank / 2, reduce_rank)); - } else { - auto lower_rank = (rank - 1) / 2; - auto upper_rank = (rank + 1) / 2; - PADDLE_ENFORCE_EQ( - reduce_rank == lower_rank || reduce_rank == upper_rank, true, - platform::errors::InvalidArgument( - "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " - "must be %d or %d, but got %d.", - rank, lower_rank, upper_rank, reduce_rank)); - } -} - -// convert dims from vector to array -template -static inline paddle::framework::Array VectorToArray( - const VectorLikeType& vec) { - PADDLE_ENFORCE_LE(vec.size(), ElementCount, - platform::errors::InvalidArgument( - "Cub reduce Array: size not match. 
Received " - "vec.size() %d > ElementCount %d.", - vec.size(), ElementCount)); - size_t n = static_cast(vec.size()); - paddle::framework::Array ret; - for (size_t i = 0; i < n; ++i) { - ret[i] = vec[i]; - } - return ret; -} - -} // namespace details - -using Tensor = framework::Tensor; -constexpr int kMaxRank = framework::DDim::kMaxRank; - -enum ReduceType { - kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; - kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim - kReduceAny = 0x03, // when reduce_dim.size() > 1 -}; - -struct IndexCalculator { - IndexCalculator(int dim, const std::vector& cal_dims, - const std::vector& cal_strides, - const std::vector& full_strides) - : dim(dim) { - dims = details::VectorToArray(cal_dims); - strides = details::VectorToArray(full_strides); - std::vector cal_divmoders; - // fast divmod - for (auto i : cal_strides) { - cal_divmoders.push_back(platform::FastDivMod(i)); - } - divmoders = - details::VectorToArray(cal_divmoders); - } - - __device__ inline int operator()(int offset) const { - int index = 0; -#pragma unroll - for (int i = 0; i < kMaxRank; ++i) { - if (i == dim) { - break; - } - auto divmod = divmoders[i].Divmod(offset); - index += (divmod.val[0] * strides[dims[i]]); - offset = divmod.val[1]; - } - return index; - } - - int dim; - framework::Array dims; - framework::Array strides; - framework::Array divmoders; -}; - -template -struct ReduceIndexMapping { - const kps::DimConfig dim; - HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) - : dim(dims) {} - - __device__ __forceinline__ int BlockIdX() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return (cluster_id() / dim.split_num_x % dim.split_num_y); - } else { - return cluster_id() % dim.split_num_x; - } -#else - return blockIdx.x; -#endif - } - - __device__ __forceinline__ int BlockIdY() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return (cluster_id() % dim.split_num_x); - } else { - return (cluster_id() / dim.split_num_x % dim.split_num_y); - } -#else - return blockIdx.y; -#endif - } - - __device__ __forceinline__ int BlockDimX() { -#ifdef PADDLE_WITH_XPU2 - return dim.deal_size_x; -#else - return blockDim.x; -#endif - } - - __device__ __forceinline__ int BlockDimY() { -#ifdef PADDLE_WITH_XPU2 - return dim.deal_size_y; -#else - return blockDim.y; -#endif - } - - __device__ __forceinline__ int GridDimX() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return dim.split_num_y; - } else { - return dim.split_num_x; - } -#else - return gridDim.x; -#endif - } - - __device__ __forceinline__ int GridDimY() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return dim.split_num_x; - } else { - return dim.split_num_y; - } -#else - return gridDim.y; -#endif - } - - __device__ __forceinline__ int GetLoopSize() { -#ifdef PADDLE_WITH_XPU2 - if (ReduceLastDim) { - return dim.deal_size_y; - } else { - return dim.deal_size_x; - } -#else - return 1; -#endif - } -}; - -// when reduce_type == kReduceLastDim this struct will be used -// for higher performance -struct OneDimIndexCal { - explicit OneDimIndexCal(int num) : stride(num) {} - - __device__ inline int operator()(int index) const { return index * stride; } - int stride; -}; - -// reduce config -template -struct ReduceConfig { - ReduceConfig(const std::vector& origin_reduce_dims, - const std::vector& origin_x_dim) - : reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {} - - // get the parameters of reduceKernel - void Run() { - // step1: update the reduce_dim left_dim and x_dim - 
SetReduceDim(); - - // step2: get the strides of dim for reduceAny and reduceLastDim - SetStrides(); - - // step3: get the type of reduce - SetReduceType(); - - // step4: set the block and grid for launch kernel - SetBlockDim(); - } - - // when should_reduce_again is true, we need malloc temp space for temp data - void SetOutputData(Ty* y_data, const platform::Place& place, - framework::Tensor* tmp) { - if (should_reduce_again) { - output_data = tmp->mutable_data( - framework::make_ddim( - {static_cast(left_num * grid.z * grid.y * sizeof(Ty))}), - place); - } else { - output_data = y_data; - } - } - - private: - // set reduce_dim, left_dim and update x_dim - // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1] - // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] - void SetReduceDim() { - std::set reduce_set; - for (auto e : reduce_dims_origin) { - auto pos = e >= 0 ? e : e + x_dim.size(); - reduce_set.insert(pos); - } - - std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); - std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); - - // update reduce_dim and x_dim - std::vector x_new_dim; - - reduce_dim.push_back(reduce_dim_temp[0]); - x_new_dim.push_back(x_dim[0]); - - int idx_reduce = 1; - int num = 0; - - if (reduce_dim_temp.size() > 1) { - for (int i = 1; i < x_dim.size(); i++) { - if ((idx_reduce < reduce_dim_temp.size()) && - (i == reduce_dim_temp[idx_reduce])) { - int result = - reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; - bool is_equal = ((result - num) == 1); - if (is_equal) { - x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; - num++; - } else { - reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num); - x_new_dim.push_back(x_dim[i]); - } - idx_reduce++; - } else { - x_new_dim.push_back(x_dim[i]); - } - } - } else { - x_new_dim = x_dim; - } - - // update x_dim - x_dim = x_new_dim; - std::vector().swap(x_new_dim); - - std::vector reduce_dim_new; - int is_reduced = 0; - for (auto e : reduce_dim) { - is_reduced |= 1 << e; - } - - std::vector().swap(reduce_dim); - - for (int i = 0; i < x_dim.size(); i++) { - if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { - x_new_dim.push_back(x_dim[i]); - if ((is_reduced >> i) & 1) - reduce_dim_new.push_back(x_new_dim.size() - 1); - } else { - x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; - } - } - - x_dim = x_new_dim; - reduce_dim = reduce_dim_new; - - int x_rank = static_cast(x_dim.size()); - std::set left_set; - - for (int i = 0; i < x_rank; ++i) { - left_set.insert(i); - } - - for (auto e : reduce_dim) { - left_set.erase(e); - } - - left_dim.assign(left_set.begin(), left_set.end()); - - // if the last dim gets involved in reduction - reduce_last_dim = (reduce_dim.back() == x_dim.size() - 1); - } - - // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny - // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1] - // --SetStrides--> x_strides= [6,1], reduce_strides = [1], - // left_strides = [1] - void SetStrides() { - std::vector idx_dim; - for (int i = 0; i < x_dim.size(); i++) { - idx_dim.push_back(i); - } - - x_strides = details::GetDimStrides(x_dim, idx_dim); - reduce_strides = details::GetDimStrides(x_dim, reduce_dim); - left_strides = details::GetDimStrides(x_dim, left_dim); - reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; - - left_num = 1; - if (left_dim.size()) { - left_num = left_strides[0] * x_dim[left_dim[0]]; - } - } - - // get the reduceType - // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim - // 
x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim - // x_dim = [8] reduce_dim = [0] --> reduceAll - // x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny - void SetReduceType() { - int rank = x_dim.size(); - int reduce_rank = reduce_dim.size(); - bool is_last_dim = - (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); - if (rank == reduce_rank || is_last_dim) { - reduce_type = static_cast(ReduceType::kReduceLastDim); - } else if (reduce_rank == 1) { -// ReduceFirstDim and reduceSecondDim -#ifdef PADDLE_WITH_XPU2 - if (reduce_dim[0] == 0) { - reduce_type = static_cast(ReduceType::kReduceHigherDim); - } else { - reduce_type = static_cast(ReduceType::kReduceAny); - } -#else - reduce_type = static_cast(ReduceType::kReduceHigherDim); -#endif - } else { - reduce_type = static_cast(ReduceType::kReduceAny); - } - } - - void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) { - constexpr int min_reduce_num_per_thread = 16; - constexpr int max_reduce_num_per_thread = 256; - constexpr int max_num_threads = kps::details::kReduceMaxThread; - - // set block size. - // 1. If reduce_last_dim == true, all the threads whose threadIdx.y are same - // will process the reduction for one output. - // The number of output for one block is blockDim.y; - // 2. If reduce_last_dim == false, different threadIdx.x will process - // different reduction and gets the output separately. If it is - // necessary, it should reduce in block y. - // The number of output for one block is blockDim.x; - int block_x, block_y; - int grid_num, reduce_num_per_thread; - if (reduce_last_dim) { - block_x = details::GetBlockDim(reduce_num); - block_y = details::GetBlockDim(left_num); - block_dim->x = block_x; - block_dim->y = - std::min(block_y, static_cast(max_num_threads / block_dim->x)); - grid_num = details::AlignUp(left_num, block_dim->y); - reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->x); - } else { - block_x = details::GetBlockDim(left_num); - block_y = details::GetBlockDim(reduce_num); - block_dim->x = std::min(block_x, 32); - block_dim->y = - std::min(block_y, static_cast(max_num_threads / block_dim->x)); - block_dim->x = - std::min(block_x, static_cast(max_num_threads / block_dim->y)); - grid_num = details::AlignUp(left_num, block_dim->x); - reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y); - } - int device_id = platform::GetCurrentDeviceId(); - int max_mp = platform::GetGPUMultiProcessors(device_id); - int max_threads_per_mp = - platform::GetGPUMaxThreadsPerMultiProcessor(device_id); - int max_threads = max_threads_per_mp * max_mp; - int num_threads = block_dim->x * block_dim->y; - int max_num_blocks = max_threads / num_threads; - - // set grid size. - // Whether to set grid.y larger than 1, there are 3 following rules: - // 1. The number that each thread process should no less than - // min_reduce_num_per_threadbut no more than max_reduce_num_per_thread; - // 2. It should maximize the utilization of SM. - // So we choose the minimum between input_split_num_1 and input_split_num_3 - // to make each thread process as mush data as possible. Meanwhile, - // the number cannot be larger than max_reduce_num_per_thread, so we - // choose the maximum between the result above and input_split_num_2. 
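The comment block above describes how SetBlockDimForReduceAny used to pick grid.y: keep the per-thread workload between min_reduce_num_per_thread and max_reduce_num_per_thread while launching enough blocks to occupy the SMs. The selection is just three rounded-up divisions and a clamp; a sketch of that arithmetic, assuming the same constants as the deleted code:

#include <algorithm>

constexpr int kMinReduceNumPerThread = 16;
constexpr int kMaxReduceNumPerThread = 256;

inline int AlignUp(int a, int b) { return (a + b - 1) / b; }

// reduce_num_per_thread: work per thread if grid.y == 1
// grid_num:              blocks needed along x to cover the outputs
// max_num_blocks:        blocks that fit on the device at this block size
int ChooseGridY(int reduce_num_per_thread, int grid_num, int max_num_blocks) {
  int split_for_min = AlignUp(reduce_num_per_thread, kMinReduceNumPerThread);
  int split_for_max = AlignUp(reduce_num_per_thread, kMaxReduceNumPerThread);
  int split_for_sm  = AlignUp(max_num_blocks, grid_num);
  // take as many splits as the SMs can absorb, but never so few that a thread
  // would process more than kMaxReduceNumPerThread elements
  return std::max(std::min(split_for_min, split_for_sm), split_for_max);
}
// A result > 1 means should_reduce_again: a second pass reduces the partials.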
- int input_split_num_1 = - details::AlignUp(reduce_num_per_thread, min_reduce_num_per_thread); - int input_split_num_2 = - details::AlignUp(reduce_num_per_thread, max_reduce_num_per_thread); - int input_split_num_3 = details::AlignUp(max_num_blocks, grid_num); - - grid_dim->x = grid_num; - grid_dim->y = std::max(std::min(input_split_num_1, input_split_num_3), - input_split_num_2); - // if grid.y > 1, we need launch reduce kernel again. - if (grid_dim->y > 1) { - should_reduce_again = true; - } - } - - // set block and grid for launch kernel - // for ReduceHigherDim: if block is enough -> splite reduce_num - // else init block(32, 1) grid(block_num, 1) - // for others: block(block_num, 1) , grid(left_num, 1) - void SetBlockDimForHigher(dim3* block_dim, dim3* grid_dim) { - int last_dim_num = x_dim.back(); - // update left_num - int grid_z = left_num / last_dim_num; - left_num = last_dim_num; - grid_dim->z = grid_z; - int device_id = platform::GetCurrentDeviceId(); - int max_mp = platform::GetGPUMultiProcessors(device_id); - int max_threads_per_mp = - platform::GetGPUMaxThreadsPerMultiProcessor(device_id); - int max_threads = max_threads_per_mp * max_mp; - // init - int num_block = (max_threads / left_num); - block_dim->x = details::GetBlockDim(left_num); - grid_dim->x = details::AlignUp(left_num, block_dim->x); - blocking_size = reduce_num; - - if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { - blocking_size = details::GetLastPow2(reduce_num / num_block); - if (blocking_size <= 1) { - blocking_size = details::GetLastPow2(sqrt(reduce_num)); - } else if (blocking_size * 2 < reduce_num) { - blocking_size *= 2; - } - should_reduce_again = true; - grid_dim->y = details::AlignUp(reduce_num, blocking_size); - } - } - - void SetBlockDim() { - // init - int block_num = details::GetBlockDim(reduce_num); - should_reduce_again = false; - dim3 block_dim(block_num, 1, 1); - dim3 grid_dim(left_num, 1, 1); - blocking_size = reduce_num; -#ifdef PADDLE_WITH_XPU2 - if (reduce_last_dim) { - block_dim.x = 128; - block_dim.y = reduce_num; - grid_dim.x = 8; - grid_dim.y = 1; - } else { - block_dim.x = 128; - block_dim.y = left_num; - grid_dim.x = 8; - grid_dim.y = 1; - } -#else - if (reduce_type == ReduceType::kReduceHigherDim) { - SetBlockDimForHigher(&block_dim, &grid_dim); - } else { - SetBlockDimForReduceAny(&block_dim, &grid_dim); - } -#endif - - block = block_dim; - grid = grid_dim; - } - - public: - std::vector reduce_dims_origin; - std::vector reduce_dim; - std::vector x_dim; - std::vector left_dim; - std::vector x_strides; - std::vector left_strides; - std::vector reduce_strides; - - int reduce_type; - int reduce_num; - int left_num; - int blocking_size; - bool should_reduce_again; - bool reduce_last_dim; - - Ty* output_data; - - dim3 block; - dim3 grid; -}; - -// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or -// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this -// function will be used -template -__global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, MPType init, - int reduce_num, int left_num, - bool reduce_last_dim, - const Calculator reduce_index_calculator, - const Calculator left_index_calculator, - const kps::DimConfig dim) { - int input_idx, left_idx, stride; - int block_size = 0; - bool need_store = true; - int loop_left = 0; - int tid = 0; - // the last dim gets involved in reduction - int store_offset = 0; - int stride_left = 0; - if (reduce_last_dim) { - auto block = ReduceIndexMapping(dim); - 
input_idx = block.BlockIdY() * block.BlockDimX(); - left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y; - stride = block.GridDimY() * block.BlockDimX(); - block_size = block.BlockDimX(); - need_store = (THREAD_ID_X == 0) && (left_idx < left_num); - store_offset = block.BlockIdY() * left_num + left_idx; - loop_left = min(block.GetLoopSize(), left_num - left_idx); - stride_left = 1; - tid = threadIdx.x; - } else { - auto block = ReduceIndexMapping(dim); - input_idx = block.BlockIdY() * block.BlockDimY(); - left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X; - stride = block.GridDimY() * block.BlockDimY(); - block_size = block.BlockDimY(); - need_store = (THREAD_ID_Y == 0) && (left_idx < left_num); - loop_left = min(block.GetLoopSize(), left_num - left_idx); - stride_left = block.BlockDimX() * block.GridDimX(); - store_offset = block.BlockIdY() * left_num + left_idx; - tid = threadIdx.y; - } - // calculate the offset, means the addr where each thread really start. - // 1. reduce for each thread - MPType input_compute[REDUCE_VEC_SIZE]; - Tx input_reg[REDUCE_VEC_SIZE]; - for (int i = 0; i < loop_left; i += stride_left) { - int input_offset = left_index_calculator(left_idx + i); - const Tx* input = x + input_offset; - MPType reduce_var = init; - // load REDUCE_VEC_SIZE data once, and then compute - int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; - for (; input_idx + block_size < bound; - input_idx += REDUCE_VEC_SIZE * stride) { - kps::ReadDataReduce, false>( - &input_reg[0], input, input_idx, reduce_index_calculator, 1, - reduce_num, 1, stride, kps::IdentityFunctor(), reduce_last_dim); - kps::ElementwiseUnary( - &input_compute[0], &input_reg[0], transformer); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - } - - kps::Init(&input_compute[0], init); - kps::ReadDataReduce( - &input_compute[0], input, input_idx, reduce_index_calculator, 1, - reduce_num - input_idx, 1, stride, transformer, reduce_last_dim); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - - kps::Reduce( - &reduce_var, &reduce_var, reducer, reduce_last_dim); - if (need_store) { - y[store_offset + i] = static_cast(reduce_var); - } - } -} - -template -__global__ void ReduceHigherDimKernel(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, MPType init, - int reduce_num, int left_num, - int blocking_size, - const kps::DimConfig dim) { - // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this - // function will be used - auto block = ReduceIndexMapping(dim); - int idy = block.BlockIdY() * blocking_size; - int idx = block.BlockIdX() * block.BlockDimX(); - int idz = BLOCK_ID_Z * left_num; - int stride = dim.split_num_x * dim.deal_size_x; - int size = left_num - dim.rem_x; - int loop_size = min(reduce_num - idy, blocking_size); - int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY(); - int block_offset = idy * left_num + idz * reduce_num; - const Tx* input = x + block_offset; - Tx reduce_input; - for (; idx < size; idx += stride) { - MPType reduce_var = init; - MPType reduce_compute = init; - for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { - kps::ReadData(&reduce_input, - input + loop_idx * left_num + idx, - block.BlockDimX(), 1, 1, left_num); - kps::ElementwiseUnary( - &reduce_compute, &reduce_input, transformer); - kps::Reduce( - &reduce_var, &reduce_compute, reducer, false); - } - Ty result = static_cast(reduce_var); - kps::WriteData(y + store_offset + idx, &result, - block.BlockDimX()); - 
} - - if (idx < left_num) { - MPType reduce_var = init; - MPType reduce_compute = init; - for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { - kps::ReadData(&reduce_input, - input + loop_idx * left_num + idx, - dim.rem_x, 1, 1, left_num); - kps::ElementwiseUnary( - &reduce_compute, &reduce_input, transformer); - kps::Reduce( - &reduce_var, &reduce_compute, reducer, false); - } - Ty result = static_cast(reduce_var); - kps::WriteData(y + store_offset + idx, &result, - dim.rem_x); - } -} - -template -static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, - const ReduceOp& reducer, - const TransformOp& transform, MPType init, - gpuStream_t stream, ReduceConfig config) { - if (config.reduce_type == kReduceLastDim) { - int stride_reduce = 1; - int stride_left = config.reduce_num; - // for higher performance - auto reduce_index_calculator = OneDimIndexCal(stride_reduce); - auto left_index_calculator = OneDimIndexCal(stride_left); - - kps::DimConfig dim = - kps::DimConfig(config.grid.x, config.grid.y, config.grid.z, - config.block.x, config.block.y, 0); - dim.SetRem(config.reduce_num % config.block.x, 0, 0); - -#ifdef PADDLE_WITH_XPU2 - ReduceAnyKernel<<<8, 128, stream>>>( - x_data, config.output_data, reducer, transform, init, config.reduce_num, - config.left_num, config.reduce_last_dim, reduce_index_calculator, - left_index_calculator, dim); -#else - ReduceAnyKernel<<>>( - x_data, config.output_data, reducer, transform, init, config.reduce_num, - config.left_num, config.reduce_last_dim, reduce_index_calculator, - left_index_calculator, dim); -#endif - - } else { - int reduce_rank = config.reduce_strides.size(); - int left_rank = config.left_strides.size(); - auto reduce_index_calculator = - IndexCalculator(reduce_rank, config.reduce_dim, config.reduce_strides, - config.x_strides); - auto left_index_calculator = IndexCalculator( - left_rank, config.left_dim, config.left_strides, config.x_strides); - - kps::DimConfig dim = - kps::DimConfig(config.grid.x, config.grid.y, config.grid.z, - config.block.x, config.block.y, 0); - dim.SetRem(config.reduce_num % config.block.x, 0, 0); - -#ifdef PADDLE_WITH_XPU2 - ReduceAnyKernel<<<8, 128, stream>>>( - x_data, config.output_data, reducer, transform, init, config.reduce_num, - config.left_num, config.reduce_last_dim, reduce_index_calculator, - left_index_calculator, dim); -#else - ReduceAnyKernel<<>>( - x_data, config.output_data, reducer, transform, init, config.reduce_num, - config.left_num, config.reduce_last_dim, reduce_index_calculator, - left_index_calculator, dim); -#endif - } - - if (config.should_reduce_again) { - dim3 block; - dim3 grid; - if (config.reduce_last_dim) { - block = dim3(32, 1, 1); - grid = dim3(details::AlignUp(config.left_num, 32), 1, 1); - } else { - block = dim3(config.block.x, 1, 1); - grid = dim3(config.grid.x, 1, config.grid.z); - } - - auto last_index = OneDimIndexCal(1); - auto first_index = OneDimIndexCal(config.left_num); - kps::DimConfig dim = - kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); - dim.SetRem(config.left_num % block.x, 0, 0); -#ifdef PADDLE_WITH_XPU2 - ReduceHigherDimKernel><<<8, 128, stream>>>( - config.output_data, y_data, reducer, kps::IdentityFunctor(), - init, config.grid.y, config.left_num, config.grid.y, dim); -#else - ReduceHigherDimKernel< - Ty, Ty, MPType, ReduceOp, - kps::IdentityFunctor><<>>( - config.output_data, y_data, reducer, kps::IdentityFunctor(), - init, config.grid.y, config.left_num, config.grid.y, dim); -#endif - } -} - -template class ReduceOp, - typename 
TransformOp> -static typename std::enable_if::value, - void>::type -CubTensorReduceFunctorImpl(const Tx* x_data, Ty* y_data, - const TransformOp& transform, int reduce_num, - const platform::Place& place, gpuStream_t stream) { - auto reducer = ReduceOp(); - cub::TransformInputIterator trans_x(x_data, - transform); - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, - reduce_num, reducer, reducer.initial(), stream); - framework::Tensor tmp; - auto* temp_storage = tmp.mutable_data( - framework::make_ddim({static_cast(temp_storage_bytes)}), place); - cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, - reduce_num, reducer, reducer.initial(), stream); -} - -template class ReduceOp, - typename TransformOp> -static typename std::enable_if::value, - void>::type -CubTensorReduceFunctorImpl(const Tx* x_data, Ty* y_data, - const TransformOp& transform, int reduce_num, - const platform::Place& place, gpuStream_t stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); -} - template class ReduceOp, typename TransformOp> void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims, gpuStream_t stream) { - auto x_dim = framework::vectorize(x.dims()); - auto config = ReduceConfig(origin_reduce_dims, x_dim); - config.Run(); - int numel = x.numel(); - // after config.run() - // SetOutputData for ReduceHigherDim when should_reduce_again is true, - // temp_output should be stored temp_data in output_data space or stored in - // y_data; - framework::Tensor tmp; - auto x_data = x.data(); - auto y_data = y->mutable_data(x.place()); - - if (config.reduce_num == 1) { - auto out_dims = y->dims(); - if (x.type() == y->type()) { - framework::TensorCopy(x, y->place(), y); - y->Resize(out_dims); - } else { - auto* dev_ctx = static_cast( - paddle::platform::DeviceContextPool::Instance().Get(x.place())); - framework::VisitDataType( - static_cast(y->type()), - CastOpFunctor(&x, y, *dev_ctx)); - } - return; - } - - config.SetOutputData(y_data, x.place(), &tmp); - constexpr bool kIsTxFP16 = std::is_same::value; - bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; - if (use_cub_reduce) { - CubTensorReduceFunctorImpl( - x_data, y_data, transform, config.reduce_num, x.place(), stream); - return; - } + y->mutable_data(x.place()); - using MPType = typename details::MPTypeTrait::Type; - auto reducer = ReduceOp(); - // launch ReduceHigherDimKernel - // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this - // function will be used - // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 - // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / - // 32 - // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 - if (config.reduce_type == ReduceType::kReduceHigherDim) { - kps::DimConfig dim = - kps::DimConfig(config.grid.x, config.grid.y, config.grid.z, - config.block.x, config.blocking_size, 0); - dim.SetRem(config.left_num % config.block.x, - config.reduce_num % config.blocking_size, 0); + auto pt_x = paddle::experimental::MakePtenDenseTensor(x); + auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); -#ifdef PADDLE_WITH_XPU2 - ReduceHigherDimKernel, - TransformOp><<<8, 128, stream>>>( - x_data, config.output_data, reducer, transform, reducer.initial(), - config.reduce_num, config.left_num, config.blocking_size, dim); -#else - 
ReduceHigherDimKernel< - Tx, Ty, MPType, ReduceOp, - TransformOp><<>>( - x_data, config.output_data, reducer, transform, reducer.initial(), - config.reduce_num, config.left_num, config.blocking_size, dim); -#endif - - if (config.should_reduce_again) { - dim3 block = dim3(config.block.x, 1, 1); - dim3 grid = dim3(config.grid.x, 1, config.grid.z); - kps::DimConfig dim2 = - kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); - dim2.SetRem(config.left_num % config.block.x, 0, 0); - -#ifdef PADDLE_WITH_XPU2 - ReduceHigherDimKernel< - Ty, Ty, MPType, ReduceOp, - kps::IdentityFunctor><<<8, 128, stream>>>( - config.output_data, y_data, reducer, - kps::IdentityFunctor(config.grid.y), reducer.initial(), - config.grid.y, config.left_num, config.grid.y, dim2); -#else - ReduceHigherDimKernel< - Ty, Ty, MPType, ReduceOp, - kps::IdentityFunctor><<>>( - config.output_data, y_data, reducer, - kps::IdentityFunctor(config.grid.y), reducer.initial(), - config.grid.y, config.left_num, config.grid.y, dim2); -#endif - } - return; - } - - // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or - // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this - // function will be used - LaunchReduceKernel, TransformOp>( - x_data, y_data, reducer, transform, reducer.initial(), stream, config); + pten::kernels::TensorReduceFunctorImpl( + *pt_x.get(), pt_y.get(), transform, origin_reduce_dims, stream); } -template class ReduceOp, - template class TransformOp> -struct TensorReduceFunc { - const framework::Tensor& x; - framework::Tensor* y; - std::vector origin_reduce_dims; - gpuStream_t stream; - int reduce_num; - TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, int num_reduce, - gpuStream_t stream) - : x(x), - y(y), - origin_reduce_dims(origin_reduce_dims), - reduce_num(num_reduce), - stream(stream) {} - - template - void apply() const { - using MPType = typename details::MPTypeTrait::Type; - TensorReduceFunctorImpl>( - x, y, TransformOp(reduce_num), origin_reduce_dims, stream); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index d3b938272e692..e1854d8a13d8b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -28,10 +28,10 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" #include "paddle/pten/include/math.h" -#include "paddle/pten/kernels/hybird/general/reduce_impl.h" +#include "paddle/pten/kernels/cpu/reduce.h" #if defined(__HIPCC__) || defined(__NVCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/pten/kernels/gpu/reduce.h" #endif namespace paddle { @@ -259,7 +259,7 @@ class ReduceKernel : public framework::OpKernel { std::vector tmp_dims(dims.begin(), dims.end()); // call new kernel - pten::general::Reduce( + pten::Reduce( dev_ctx, *pt_x.get(), reduce_all, tmp_dims, keep_dim, pten::TransToPtenDataType(cast_out_dtype), pt_out.get()); } @@ -700,24 +700,28 @@ class ReduceCudaKernel : public framework::OpKernel { auto out_dtype = context.Attr("out_dtype"); std::vector dims = context.Attr>("dim"); - std::vector reduce_dims = - GetReduceDim(dims, input->dims().size(), reduce_all); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (input->dims())[i]; - } - gpuStream_t stream = context.cuda_device_context().stream(); + auto& dev_ctx = context.cuda_device_context(); + if (out_dtype >= 0) { - framework::VisitDataTypeSmall( - static_cast(out_dtype), - TensorReduceFunc( - *input, output, reduce_dims, reduce_num, stream)); + output->mutable_data( + dev_ctx.GetPlace(), + static_cast(out_dtype)); } else { - using MPType = typename details::MPTypeTrait::Type; - TensorReduceFunctorImpl>( - *input, output, TransformOp(reduce_num), reduce_dims, - stream); + output->mutable_data( + dev_ctx.GetPlace(), + static_cast(input->type())); } + + auto pt_x = paddle::experimental::MakePtenDenseTensor(*input); + auto pt_out = paddle::experimental::MakePtenDenseTensor(*output); + std::vector dims_int64{dims.begin(), dims.end()}; + + auto pt_out_dtype = pten::TransToPtenDataType( + static_cast(out_dtype)); + + pten::Reduce(dev_ctx, *pt_x.get(), reduce_all, + dims_int64, false, pt_out_dtype, + pt_out.get()); } }; #endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_xpu.cc new file mode 100644 index 0000000000000..ae7e1317323dd --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_xpu.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifdef PADDLE_WITH_XPU + +#include +#include + +#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" + +namespace paddle { +namespace operators { +template +class ReduceProdXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(context.GetPlace()), true, + platform::errors::Unavailable("This kernel only runs on XPU.")); + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + + std::vector xdims; + for (int i = 0; i < input->dims().size(); i++) { + xdims.push_back(input->dims()[i]); + } + auto rdims = context.Attr>("dim"); + const auto& input_dim_size = input->dims().size(); + + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < xdims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < rdims.size(); ++i) { + if (rdims[i] < 0) { + reduce_dims.push_back(rdims[i] + input_dim_size); + } else { + reduce_dims.push_back(rdims[i]); + } + } + } + int r = xpu::reduce_prod( + dev_ctx.x_context(), reinterpret_cast(input->data()), + reinterpret_cast(output->data()), xdims, reduce_dims); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU reduce_prod kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_XPU_KERNEL( + reduce_prod, + ops::ReduceProdXPUKernel); + +#endif diff --git a/paddle/fluid/operators/renorm_op.cc b/paddle/fluid/operators/renorm_op.cc new file mode 100644 index 0000000000000..b15193e0e99d8 --- /dev/null +++ b/paddle/fluid/operators/renorm_op.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
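The reduce_prod XPU kernel above builds its reduce-axis list the way the other reduce kernels in this series do: reduce_all takes every axis, and negative axes are wrapped by adding the input rank. A minimal sketch of that normalization, with a hypothetical helper name rather than anything from the Paddle API:

#include <vector>

// Normalize user-supplied reduce axes: wrap negative axes by the rank, or
// take every axis when reduce_all is set. Hypothetical helper, shown only to
// illustrate the pattern used by the kernel above.
std::vector<int> NormalizeReduceDims(const std::vector<int>& dims, int rank,
                                     bool reduce_all) {
  std::vector<int> out;
  if (reduce_all) {
    for (int i = 0; i < rank; ++i) out.push_back(i);
    return out;
  }
  for (int d : dims) out.push_back(d < 0 ? d + rank : d);
  return out;
}

int main() {
  std::vector<int> r =
      NormalizeReduceDims({-1, 0}, /*rank=*/3, /*reduce_all=*/false);
  return (r[0] == 2 && r[1] == 0) ? 0 : 1;  // {-1, 0} on rank 3 -> {2, 0}
}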
+ +#include "paddle/fluid/operators/renorm_op.h" + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +class RenormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + using DDim = paddle::framework::DDim; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "abs"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "abs"); + + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class RenormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of renorm op."); + AddOutput("Out", "(Tensor), The output tensor of renorm op."); + AddAttr("p", "(float, norm's power"); + AddAttr("axis", + "int,the dimension to slice over to get the sub-tensors"); + AddAttr("max_norm", "(float, the norm upper-bound"); + AddAttr("use_cudnn", + "(bool, default false) Only used in cudnn kernel, need " + "install cudnn") + .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddComment(R"DOC( +Renorm Operator. + +This operator is used to scale tensor sliced by axis if its p-norm execeeds maxnorm + +)DOC"); + } +}; + +class RenormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@Grad", "AbsGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "AbsGrad"); + + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class RenormGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("renorm_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(renorm, ops::RenormOp, ops::RenormOpMaker, + ops::RenormGradMaker, + ops::RenormGradMaker); + +REGISTER_OPERATOR(renorm_grad, ops::RenormGradOp); + +REGISTER_OP_CPU_KERNEL(renorm, ops::CPURenormKernel, + ops::CPURenormKernel); + +REGISTER_OP_CPU_KERNEL(renorm_grad, ops::CPURenormGradKernel, + ops::CPURenormGradKernel); \ No newline at end of file diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu new file mode 100644 index 0000000000000..1798faa759bed --- /dev/null +++ b/paddle/fluid/operators/renorm_op.cu @@ -0,0 +1,238 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/renorm_op.h" + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace paddle { +namespace operators { + +__device__ __forceinline__ float inline_pow(float base, float exponent) { + return pow(base, exponent); +} + +__device__ __forceinline__ double inline_pow(double base, double exponent) { + return pow(base, exponent); +} + +__device__ __forceinline__ float inline_abs(float x) { return abs(x); } +__device__ __forceinline__ double inline_abs(double x) { return abs(x); } + +template +struct UnsignedPowFunctor { + HOSTDEVICE explicit inline UnsignedPowFunctor(float porder) { + this->porder = porder; + } + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(inline_pow(inline_abs(x), static_cast(porder))); + } + float porder; +}; + +template +__global__ void RenormKernelFunc3(int64_t size, T* dim_value, float p, + float max_norm) { + int64_t i = ((int64_t)blockIdx.x) * blockDim.x + threadIdx.x; + if (i < size) { + T temp = pow(dim_value[i], (T)(1.0 / p)); + dim_value[i] = 1.0; + if (temp > max_norm) dim_value[i] = max_norm / temp; + } +} + +template +__global__ void RenormKernelFunc4(const T* x_data, T* out_data, int64_t size, + T* dim_value, int64_t dimension_each, + int64_t dim_divisor) { + int64_t i = ((int64_t)blockIdx.x) * blockDim.x + threadIdx.x; + auto dim_index = i / dim_divisor % dimension_each; + if (i < size) { + if (dim_value[dim_index] < 1.0) + out_data[i] = dim_value[dim_index] * x_data[i]; + else + out_data[i] = x_data[i]; + } +} + +template +__global__ void RenormGradKernelFunc1(const T* x_data, const T* dout_data, + T* pow_value, T* mul_value, int64_t size, + int64_t dimension_each, float p, + int64_t dim_divisor) { + int64_t i = ((int64_t)blockIdx.x) * blockDim.x + threadIdx.x; + auto dim_index = i / dim_divisor % dimension_each; + if (i < size) { + pow_value[i] = pow(abs(x_data[i]), (T)p); + mul_value[i] = x_data[i] * dout_data[i]; + } +} + +template +__global__ void RenormGradKernelFunc2(const T* x_data, const T* dout_data, + T* dx_data, int64_t size, T* dim_value, + T* dim_power_sum, T* weight_derivative, + int64_t dimension_each, float p, + float max_norm, int64_t dim_divisor) { + int64_t i = ((int64_t)blockIdx.x) * blockDim.x + threadIdx.x; + auto dim_index = i / dim_divisor % dimension_each; + if (i < dimension_each) { + dim_power_sum[i] = 0; + auto temp = pow(dim_value[i], (T)(1.0 / p)); + if (temp > max_norm) { + dim_power_sum[i] = pow(dim_value[i], (T)(-1.0 - 1.0 / p)) * -1 * max_norm; + dim_value[i] = max_norm / temp; + } else { + dim_value[i] = 1.0; + } + } + __syncthreads(); + if (i < size) { + dx_data[i] = dim_value[dim_index] * dout_data[i]; + dx_data[i] = dx_data[i] + + 
weight_derivative[dim_index] * dim_power_sum[dim_index] * + pow(abs(x_data[i]), T(p - 1.0)) * + (x_data[i] >= 0 ? 1 : -1); + } +} + +template +class CUDARenormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + auto numel = x->numel(); + const T* x_data = x->data(); + auto input_dims = x->dims(); + float max_norm = context.Attr("max_norm"); + float p = context.Attr("p"); + int dim = context.Attr("axis"); + auto dimension_each = input_dims[dim]; + auto dim_size = input_dims.size(); + framework::Tensor pow_value, dim_value; + int64_t dim_divisor = 1, pre_mul = 1; + for (int i = dim + 1; i < dim_size; i++) dim_divisor *= input_dims[i]; + for (int i = 0; i < dim; i++) pre_mul *= input_dims[i]; + pow_value.Resize( + framework::make_ddim({pre_mul, dimension_each, dim_divisor})); + dim_value.Resize(framework::make_ddim({dimension_each})); + pow_value.mutable_data(context.GetPlace()); + out->Resize(framework::make_ddim(framework::vectorize(input_dims))); + T* out_data = out->mutable_data(context.GetPlace()); + auto stream = context.cuda_device_context().stream(); + int block = std::min(numel, static_cast(256)); + using MT = typename details::MPTypeTrait::Type; + int grid = (numel + block - 1) / block; + + int block2 = std::min(dimension_each, static_cast(256)); + int grid2 = (dimension_each + block2 - 1) / block2; + std::vector ins = {x}; + std::vector outs = {&pow_value}; + auto func = UnsignedPowFunctor(p); + const auto& cuda_ctx = + context.template device_context(); + + LaunchSameDimsElementwiseCudaKernel>( + cuda_ctx, ins, &outs, func); + std::vector reduce_axis = {0, 2}; + TensorReduceFunctorImpl>( + pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, stream); + RenormKernelFunc3<<>>( + numel, dim_value.mutable_data(context.GetPlace()), p, max_norm); + RenormKernelFunc4<<>>( + x_data, out_data, numel, dim_value.mutable_data(context.GetPlace()), + dimension_each, dim_divisor); + // platform::GpuStreamSync(stream); + } +}; + +template +class CUDAGradRenormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor* d_out = + ctx.Input(framework::GradVarName("Out")); + const framework::Tensor* x = ctx.Input("X"); + framework::Tensor* d_x = + ctx.Output(framework::GradVarName("X")); + + auto numel = d_out->numel(); + const T* dout_data = d_out->data(); + const T* x_data = x->data(); + auto input_dims = x->dims(); + float max_norm = ctx.Attr("max_norm"); + float p = ctx.Attr("p"); + int dim = ctx.Attr("axis"); + auto dimension_each = input_dims[dim]; + auto dim_size = input_dims.size(); + int64_t dim_divisor = 1, pre_mul = 1; + for (int i = dim + 1; i < dim_size; i++) dim_divisor *= input_dims[i]; + for (int i = 0; i < dim; i++) pre_mul *= input_dims[i]; + d_x->Resize(framework::make_ddim(framework::vectorize(input_dims))); + T* dx_data = d_x->mutable_data(ctx.GetPlace()); + framework::Tensor pow_value, mul_value, dim_value, dim_power_sum, + weight_derivative; + pow_value.Resize( + framework::make_ddim({pre_mul, dimension_each, dim_divisor})); + mul_value.Resize( + framework::make_ddim({pre_mul, dimension_each, dim_divisor})); + dim_value.Resize(framework::make_ddim({dimension_each})); + dim_power_sum.Resize(framework::make_ddim({dimension_each})); + weight_derivative.Resize(framework::make_ddim({dimension_each})); + auto stream = 
ctx.cuda_device_context().stream(); + int block = std::min(numel, static_cast(256)); + int grid = (numel + block - 1) / block; + pow_value.mutable_data(ctx.GetPlace()); + mul_value.mutable_data(ctx.GetPlace()); + dim_value.mutable_data(ctx.GetPlace()); + dim_power_sum.mutable_data(ctx.GetPlace()); + weight_derivative.mutable_data(ctx.GetPlace()); + RenormGradKernelFunc1<<>>( + x_data, dout_data, pow_value.mutable_data(ctx.GetPlace()), + mul_value.mutable_data(ctx.GetPlace()), numel, dimension_each, p, + dim_divisor); + std::vector reduce_axis = {0, 2}; + TensorReduceFunctorImpl>( + pow_value, &dim_value, kps::IdentityFunctor(), reduce_axis, stream); + TensorReduceFunctorImpl>( + mul_value, &weight_derivative, kps::IdentityFunctor(), reduce_axis, + stream); + RenormGradKernelFunc2<<>>( + x_data, dout_data, dx_data, numel, + dim_value.mutable_data(ctx.GetPlace()), + dim_power_sum.mutable_data(ctx.GetPlace()), + weight_derivative.mutable_data(ctx.GetPlace()), dimension_each, p, + max_norm, dim_divisor); + // platform::GpuStreamSync(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(renorm, ops::CUDARenormKernel, + ops::CUDARenormKernel); + +REGISTER_OP_CUDA_KERNEL(renorm_grad, ops::CUDAGradRenormKernel, + ops::CUDAGradRenormKernel); diff --git a/paddle/fluid/operators/renorm_op.h b/paddle/fluid/operators/renorm_op.h new file mode 100644 index 0000000000000..461f383ad2563 --- /dev/null +++ b/paddle/fluid/operators/renorm_op.h @@ -0,0 +1,191 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
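The renorm kernels above (CUDA in renorm_op.cu, CPU in renorm_op.h below) implement the rule stated in the op comment: for each slice taken along axis, compute the p-norm (sum of |x|^p)^(1/p); if it exceeds max_norm, scale the whole slice by max_norm / norm, otherwise leave it unchanged. The standalone sketch below applies that rule to contiguous rows; the real kernels handle the general strided layout through dim_divisor. Names and layout here are illustrative only.

#include <cmath>
#include <cstdio>
#include <vector>

// Renorm a flat [num_slices x slice_len] matrix row by row: if ||row||_p
// exceeds max_norm, rescale the row so its p-norm becomes max_norm.
void RenormRows(std::vector<float>* data, int num_slices, int slice_len,
                float p, float max_norm) {
  for (int s = 0; s < num_slices; ++s) {
    double norm_p = 0.0;
    for (int j = 0; j < slice_len; ++j)
      norm_p += std::pow(std::abs((*data)[s * slice_len + j]), p);
    double norm = std::pow(norm_p, 1.0 / p);
    if (norm > max_norm) {
      double scale = max_norm / norm;
      for (int j = 0; j < slice_len; ++j) (*data)[s * slice_len + j] *= scale;
    }
  }
}

int main() {
  // Two slices, p = 2, max_norm = 5: the first row has norm 5 and is kept,
  // the second has norm 10 and is scaled down to norm 5.
  std::vector<float> x = {3, 4, 6, 8};
  RenormRows(&x, /*num_slices=*/2, /*slice_len=*/2, /*p=*/2.f, /*max_norm=*/5.f);
  std::printf("%g %g %g %g\n", x[0], x[1], x[2], x[3]);  // prints: 3 4 3 4
  return 0;
}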
+ +#pragma once + +#include "math.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/platform/for_range.h" +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +// template +// struct NormDimValueFunctor { +// NormDimValueFunctor(T* input, T* output, int64_t dim_divisor, int64_t +// dimension_each, float p) +// : input_(input), output_(output),dim_divisor_(dim_divisor), +// dimension_each_(dimension_each),p_(p) {} + +// HOSTDEVICE void operator()(int64_t i) const { +// auto dim_index = i / dim_divsor % dimension_each; +// dim_value[dim_index] += std::pow(std::abs(input[i]), p); +// } + +// T* input_; +// T* output_; +// int64_t dimension_each_, dim_divisor_; +// float p_,max_norm_; + +// }; +// template +template +class CPURenormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + auto numel = x->numel(); + auto* x_data = x->data(); + auto input_dims = x->dims(); + float max_norm = context.Attr("max_norm"); + float p = context.Attr("p"); + int dim = context.Attr("axis"); + auto dimension_each = input_dims[dim]; + auto dim_size = input_dims.size(); + int64_t dim_divisor = 1; + for (int i = dim + 1; i < dim_size; i++) dim_divisor *= input_dims[i]; + + // auto& dev_ctx = ctx.template device_context(); + // std::vector dim_index(dim_size, 0); + std::vector dim_value(dimension_each, + 0); // dim_value = (x1^p + x2^p + x3^p....)^(1/p) + + auto* out_data = + out->mutable_data(context.GetPlace(), size_t(numel * sizeof(T))); + + int64_t index = 0, dim_index = 0; + for (int64_t i = 0; i < numel; i++) { + // auto dim_index = i / dim_divsor % dimension_each; + dim_value[dim_index] += std::pow(std::abs(x_data[i]), p); + index++; + if (index == dim_divisor) { + dim_index++; + if (dim_index == dimension_each) { + dim_index = 0; + } + index = 0; + } + } + for (int64_t i = 0; i < dimension_each; i++) { + dim_value[i] = std::pow(dim_value[i], 1.0 / p); + if (dim_value[i] > max_norm) + dim_value[i] = max_norm / dim_value[i]; + else + dim_value[i] = 1.0; + // dim_index[i] = 0; + } + index = dim_index = 0; + for (int64_t i = 0; i < numel; i++) { + // auto dim_index = i / dim_divsor % dimension_each; + out_data[i] = dim_value[dim_index] < 1.0 + ? 
dim_value[dim_index] * x_data[i] + : x_data[i]; + index++; + if (index == dim_divisor) { + dim_index++; + if (dim_index == dimension_each) { + dim_index = 0; + } + index = 0; + } + } + } +}; + +// template +template +class CPURenormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const framework::Tensor* d_out = + ctx.Input(framework::GradVarName("Out")); + const framework::Tensor* x = ctx.Input("X"); + framework::Tensor* d_x = + ctx.Output(framework::GradVarName("X")); + + auto numel = d_out->numel(); + auto* dout_data = d_out->data(); + auto* x_data = x->data(); + auto input_dims = x->dims(); + float max_norm = ctx.Attr("max_norm"); + float p = ctx.Attr("p"); + int dim = ctx.Attr("axis"); + auto dimension_each = input_dims[dim]; + auto dim_size = input_dims.size(); + int64_t dim_divisor = 1; + for (int i = dim + 1; i < dim_size; i++) dim_divisor *= input_dims[i]; + auto* dx_data = d_x->mutable_data( + ctx.GetPlace(), static_cast(numel * sizeof(T))); + std::vector dim_value(dimension_each, 0), + dim_power_sum(dimension_each, 0), + weight_derivative(dimension_each, 0.0); + int64_t index = 0, dim_index = 0; + for (int64_t i = 0; i < numel; i++) { + // auto dim_index = i / dim_divsor % dimension_each; + dim_value[dim_index] += std::pow(std::abs(x_data[i]), p); + index++; + if (index == dim_divisor) { + dim_index++; + if (dim_index == dimension_each) { + dim_index = 0; + } + index = 0; + } + } + for (int64_t i = 0; i < dimension_each; i++) { + auto temp = std::pow(dim_value[i], 1.0 / p); + if (temp > max_norm) { + dim_power_sum[i] = + std::pow(dim_value[i], (T)(-1.0 - 1.0 / p)) * -1 * max_norm; + dim_value[i] = max_norm / temp; + } else + dim_value[i] = 1.0; + } + index = dim_index = 0; + for (int64_t i = 0; i < numel; i++) { + // auto dim_index = i / dim_divsor % dimension_each; + dx_data[i] = dim_value[dim_index] * dout_data[i]; + weight_derivative[dim_index] += x_data[i] * dout_data[i]; + index++; + if (index == dim_divisor) { + dim_index++; + if (dim_index == dimension_each) { + dim_index = 0; + } + index = 0; + } + } + index = dim_index = 0; + for (int64_t i = 0; i < numel; i++) { + // auto dim_index = i / dim_divsor % dimension_each; + dx_data[i] += weight_derivative[dim_index] * dim_power_sum[dim_index] * + std::pow(std::abs(x_data[i]), p - 1.0) * + (x_data[i] >= 0 ? 1 : -1); + index++; + if (index == dim_divisor) { + dim_index++; + if (dim_index == dimension_each) { + dim_index = 0; + } + index = 0; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 856f4020cfcf6..f2162f55636e5 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
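One detail of the CPU renorm kernel above: rather than evaluating the commented-out expression (i / dim_divisor) % dimension_each for every element, it carries index / dim_index counters that are advanced and wrapped as i grows. The small check below (illustrative, not part of the op) confirms the two formulations agree:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t dimension_each = 3, dim_divisor = 4, numel = 2 * 3 * 4;
  int64_t index = 0, dim_index = 0;
  for (int64_t i = 0; i < numel; ++i) {
    // Counter-based walk used by the kernel vs. the direct formula.
    assert(dim_index == (i / dim_divisor) % dimension_each);
    ++index;
    if (index == dim_divisor) {
      ++dim_index;
      if (dim_index == dimension_each) dim_index = 0;
      index = 0;
    }
  }
  return 0;
}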
*/ #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/reshape_kernel.h" namespace paddle { namespace framework { class InferShapeContext; @@ -385,8 +385,7 @@ class ReshapeKernel { // We can't MakePtenDenseTensor for case 2, so we solve this case by // creating a temporary tensor here: pten::DenseTensorMeta meta{pten::TransToPtenDataType(in->type()), - in->dims(), - pten::TransToPtenDataLayout(in->layout())}; + in->dims(), in->layout()}; auto pt_out_tmp = std::make_shared( pten::make_intrusive( ctx.GetPlace()), @@ -439,25 +438,24 @@ class ReshapeKernel { } if (platform::is_cpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::Reshape(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::Reshape(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); } #endif #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); - pten::Reshape(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); + pten::ReshapeKernel(dev_ctx, *pt_x.get(), pt_scalar_shape, pt_out); } #endif // non-inplace need move all result from pt_out to out, inplace need set // result dims. if (in != out) { - paddle::experimental::MovesSharedStorage(pt_out, - static_cast(out)); + paddle::experimental::SharesStorage(pt_out, static_cast(out)); } else { out->Resize(pt_out->dims()); } diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 80758e1718be4..ec62feb07bc80 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -153,6 +153,31 @@ class RunProgramGradOp : public framework::OperatorWithKernel { } }; +template +struct FilterHelper {}; + +template <> +struct FilterHelper { + static void filter(const BlockDesc* desc, + imperative::TracedVarList* vec) { + auto f = [desc](std::shared_ptr ptr) { + return !desc->HasVar(ptr->Name()); + }; + auto new_end = std::remove_if(vec->begin(), vec->end(), f); + vec->resize(new_end - vec->begin()); + } +}; + +template <> +struct FilterHelper { + static void filter(const BlockDesc* desc, std::vector* vec) { + auto f = [desc](const std::string& name) { return !desc->HasVar(name); }; + auto new_end = std::remove_if(vec->begin(), vec->end(), f); + vec->resize(new_end - vec->begin()); + } +}; + template class RunProgramGradOpMaker : public framework::SingleGradOpMaker { public: @@ -167,8 +192,12 @@ class RunProgramGradOpMaker : public framework::SingleGradOpMaker { grad_op->SetInput("OutScope", this->Output("OutScope")); grad_op->SetInput("DOut", this->Output("DOut")); grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetOutput(framework::GradVarName("Params"), - this->InputGrad("Params")); + + auto block_desc = + BOOST_GET_CONST(BlockDesc*, this->GetAttr("global_block")); + auto params_grad = this->InputGrad("Params"); + FilterHelper::filter(block_desc, ¶ms_grad); // filter the vector. 
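FilterHelper above drops gradient slots for parameters that the global block does not declare, using std::remove_if followed by resize. The same effect is commonly written with the erase-remove idiom; a minimal equivalent with hypothetical names:

#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>

// Keep only the names that the (hypothetical) block actually declares.
void FilterUndeclared(const std::unordered_set<std::string>& declared,
                      std::vector<std::string>* names) {
  names->erase(std::remove_if(names->begin(), names->end(),
                              [&](const std::string& n) {
                                return declared.count(n) == 0;
                              }),
               names->end());
}

int main() {
  std::unordered_set<std::string> declared = {"w", "b"};
  std::vector<std::string> grads = {"w", "unused_param", "b"};
  FilterUndeclared(declared, &grads);
  return grads.size() == 2 ? 0 : 1;  // "unused_param" is dropped
}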
+ grad_op->SetOutput(framework::GradVarName("Params"), params_grad); grad_op->SetAttrMap(this->Attrs()); } }; diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc new file mode 100644 index 0000000000000..fadf063bc5bd6 --- /dev/null +++ b/paddle/fluid/operators/scatter_op_xpu.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include +#include + +#include "paddle/fluid/operators/scatter_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ScatterOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Ids"); + auto *updates = ctx.Input("Updates"); + auto *out = ctx.Output("Out"); + bool overwrite = ctx.Attr("overwrite"); + + // In place output: Out = X, Out[ids] = Updates + framework::TensorCopy(*x, ctx.GetPlace(), out); + // Apply ScatterUpdate: Out[index] = Updates[:] + const auto &index_type = index->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64))); + + // check index of shape 1-D + PADDLE_ENFORCE_EQ( + index->dims().size() == 1 || + (index->dims().size() == 2 && index->dims()[1] == 1), + true, platform::errors::InvalidArgument( + "index's shape is error, " + "expect index'dims shape is 1 or 2 and index.dims[1] is 1" + "but got index'dims shape is %d", + index->dims().size())); + + int index_size = static_cast(index->dims()[0]); + auto x_dims = x->dims(); + auto update_dims = updates->dims(); + for (int i = 1; i < x_dims.size(); i++) + PADDLE_ENFORCE_EQ( + x_dims[i], update_dims[i], + platform::errors::InvalidArgument( + "The dimensions of the source tensor and target tensor should" + " match, but received source tensor's %d-th dimension is %d," + "target tensor's %d-th dimension is %d.", + i, x_dims[i], i, update_dims[i])); + + int dim0 = static_cast(x->dims()[0]); + int dim1 = static_cast( + framework::product(framework::slice_ddim(x_dims, 1, x_dims.size()))); + T *out_data = out->data(); + const T *updates_data = updates->data(); + + auto &dev_ctx = + ctx.template device_context(); + int r = XPU_SUCCESS; + + Tensor indices_cpu(index->type()); + framework::TensorCopy(*index, platform::CPUPlace(), &indices_cpu); + + if (index_type == framework::proto::VarType::INT32) { + auto index_data = const_cast(index->data()); + xpu::VectorParam indices{indices_cpu.data(), index_size, + index_data}; + r = xpu::scatter(dev_ctx.x_context(), 
updates_data, out_data, indices, + dim0, dim1, overwrite); + } else { + auto index_data = const_cast(index->data()); + xpu::VectorParam indices{indices_cpu.data(), index_size, + index_data}; + r = xpu::scatter(dev_ctx.x_context(), updates_data, out_data, indices, + dim0, dim1, overwrite); + } + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU scatter kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL(scatter, ops::ScatterOpXPUKernel, + ops::ScatterOpXPUKernel); +#endif diff --git a/paddle/fluid/operators/shuffle_batch_op.h b/paddle/fluid/operators/shuffle_batch_op.h index f05af3f249ce0..bd24bbeb9f047 100644 --- a/paddle/fluid/operators/shuffle_batch_op.h +++ b/paddle/fluid/operators/shuffle_batch_op.h @@ -33,13 +33,9 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + template using Vector = framework::Vector; -#else -template -using Vector = framework::CPUVector; -#endif template class ShuffleBatchKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc new file mode 100644 index 0000000000000..7e21cba14b7dc --- /dev/null +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
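The scatter XPU kernel above first copies X into Out and then writes Updates at the rows named by the 1-D Ids tensor, passing the overwrite attribute straight through to xpu::scatter (with overwrite set, a later update for a duplicate index replaces the earlier one). A tiny CPU reference of the overwrite form, for illustration only:

#include <cstdint>
#include <cstdio>
#include <vector>

// Out = X; then Out[index[i], :] = Updates[i, :] for each i (overwrite form).
// dim0 = rows of X, dim1 = elements per row, as in the kernel above.
void ScatterOverwrite(const std::vector<float>& x,
                      const std::vector<int64_t>& index,
                      const std::vector<float>& updates, int64_t dim0,
                      int64_t dim1, std::vector<float>* out) {
  *out = x;  // In place output: Out = X
  for (size_t i = 0; i < index.size(); ++i)
    for (int64_t j = 0; j < dim1; ++j)
      (*out)[index[i] * dim1 + j] = updates[i * dim1 + j];
  (void)dim0;  // kept only to mirror the kernel's argument list
}

int main() {
  std::vector<float> x = {0, 0, 0, 0, 0, 0};  // 3 rows x 2 cols
  std::vector<float> out;
  ScatterOverwrite(x, {2, 0}, {1, 1, 7, 7}, /*dim0=*/3, /*dim1=*/2, &out);
  std::printf("%g %g %g %g %g %g\n", out[0], out[1], out[2], out[3], out[4],
              out[5]);  // prints: 7 7 0 0 1 1
  return 0;
}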
+ +#ifdef PADDLE_WITH_XPU + +#include +#include + +#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" + +namespace paddle { +namespace operators { + +template +class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(context.GetPlace()), true, + platform::errors::Unavailable("This kernel only runs on XPU.")); + + // input and output data + auto* input = context.Input("X"); + auto* label = context.Input("Label"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + + // attrs + bool normalize = context.Attr("normalize"); + PADDLE_ENFORCE_EQ( + normalize, false, + platform::errors::InvalidArgument("normalize only support true now.")); + int ignore_index = context.Attr("ignore_index"); + PADDLE_ENFORCE_EQ(ignore_index, kIgnoreIndex, + platform::errors::InvalidArgument( + "ignore_index only support %d now.", kIgnoreIndex)); + + int r = xpu::sigmoid_cross_entropy_with_logits( + dev_ctx.x_context(), reinterpret_cast(input->data()), + reinterpret_cast(label->data()), + reinterpret_cast(output->data()), 1, input->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU sigmoid_cross_entropy_with_logits " + "kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +template +class SigmoidCrossEntropyWithLogitsGradXPUKernel + : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(context.GetPlace()), true, + platform::errors::Unavailable("This kernel only runs on XPU.")); + + // input and output data + auto* input = context.Input("X"); + auto* label = context.Input("Label"); + auto* dy = context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + dx->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + + int r = xpu::sigmoid_cross_entropy_with_logits_grad( + dev_ctx.x_context(), reinterpret_cast(input->data()), + reinterpret_cast(label->data()), + reinterpret_cast(dy->data()), + reinterpret_cast(dx->data()), 1, input->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU sigmoid_cross_entropy_with_logits_grad " + "kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsXPUKernel< + paddle::platform::XPUDeviceContext, float>); + +REGISTER_OP_XPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradXPUKernel< + paddle::platform::XPUDeviceContext, float>); + +#endif diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h index b7a46cc546797..b93c062cda200 100644 --- a/paddle/fluid/operators/sign_op.h +++ b/paddle/fluid/operators/sign_op.h @@ -19,9 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -// only can include the headers in paddle/pten/api dirs #include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/sign_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index b5edc1dda533b..64751a21c837d 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -587,15 +587,13 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, collapsed_input_conj.data()); for_range(functor); MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( - desc.get(), collapsed_input_conj.data(), - collapsed_output.data())); + desc.get(), collapsed_input_conj.data(), collapsed_output.data())); } else if (fft_type == FFTTransformType::R2C && !forward) { framework::Tensor collapsed_output_conj(collapsed_output.type()); collapsed_output_conj.mutable_data(collapsed_output.dims(), ctx.GetPlace()); MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), - collapsed_output_conj.data())); + desc.get(), collapsed_input.data(), collapsed_output_conj.data())); // conjugate the output platform::ForRange for_range(ctx, collapsed_output.numel()); math::ConjFunctor functor(collapsed_output_conj.data(), @@ -605,12 +603,10 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, } else { if (forward) { MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( - desc.get(), collapsed_input.data(), - collapsed_output.data())); + desc.get(), collapsed_input.data(), collapsed_output.data())); } else { MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( - desc.get(), collapsed_input.data(), - collapsed_output.data())); + desc.get(), collapsed_input.data(), collapsed_output.data())); } } diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index 2066ce955cafe..d6a775dd55de8 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -115,22 +115,19 @@ void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config, math::ConjFunctor functor(input->data(), input->numel(), input_conj.data()); for_range(functor); - exec_cufft_plan_raw(config, input_conj.data(), output->data(), - forward); + exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward); } else if (fft_type == FFTTransformType::R2C && !forward) { forward = true; framework::Tensor out_conj(output->type()); out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_cufft_plan_raw(config, input->data(), out_conj.data(), - forward); + exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward); platform::ForRange for_range(ctx, output->numel()); math::ConjFunctor functor(out_conj.data(), output->numel(), output->data()); for_range(functor); } else { - exec_cufft_plan_raw(config, input->data(), output->data(), - forward); + exec_cufft_plan_raw(config, input->data(), output->data(), forward); } } @@ -227,22 +224,19 @@ void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config, math::ConjFunctor functor(input->data(), input->numel(), input_conj.data()); for_range(functor); - exec_hipfft_plan_raw(config, input_conj.data(), output->data(), - forward); + exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward); } else if (fft_type == FFTTransformType::R2C && !forward) { forward = true; 
framework::Tensor out_conj(output->type()); out_conj.mutable_data(output->dims(), ctx.GetPlace()); - exec_hipfft_plan_raw(config, input->data(), out_conj.data(), - forward); + exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward); platform::ForRange for_range(ctx, output->numel()); math::ConjFunctor functor(out_conj.data(), output->numel(), output->data()); for_range(functor); } else { - exec_hipfft_plan_raw(config, input->data(), output->data(), - forward); + exec_hipfft_plan_raw(config, input->data(), output->data(), forward); } } diff --git a/paddle/fluid/operators/split_op_xpu.cc b/paddle/fluid/operators/split_op_xpu.cc new file mode 100644 index 0000000000000..bd8c6914876da --- /dev/null +++ b/paddle/fluid/operators/split_op_xpu.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/split_op.h" +#include +#include +#include "paddle/fluid/platform/device/xpu/xpu_header.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SplitXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto output = ctx.MultiOutput("Out"); + int num = ctx.Attr("num"); + std::vector sections = ctx.Attr>("sections"); + int axis = ctx.Attr("axis"); + auto& dev_ctx = ctx.template device_context(); + auto in_dims = input->dims(); + + auto input_shape = framework::vectorize(in_dims); + std::vector split_lists; + std::vector out_ptrs; + auto outs_number = output.size(); + std::vector outs_dims = + UpdateOutsDims(true, true, in_dims, num, sections, axis, outs_number); + for (size_t i = 0; i < output.size(); ++i) { + output[i]->Resize(outs_dims[i]); + out_ptrs.push_back(output[i]->mutable_data(ctx.GetPlace())); + split_lists.push_back(output[i]->dims()[axis]); + } + + int r = xpu::split(dev_ctx.x_context(), input->data(), out_ptrs, + input_shape, split_lists, axis); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU split kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + split, ops::SplitXPUKernel, + ops::SplitXPUKernel); +#endif diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc new file mode 100644 index 0000000000000..fef5d10f2da00 --- /dev/null +++ b/paddle/fluid/operators/take_along_axis_op.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/take_along_axis_op.h" +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { + +class TakeAlongAxisOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("Input"), true, + platform::errors::InvalidArgument( + "Input(Input) of TakeAlongAxisOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Index"), true, + platform::errors::InvalidArgument( + "Input(Index) of TakeAlongAxisOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Result"), true, + platform::errors::InvalidArgument( + "Output(Result) of TakeAlongAxisOp should not be null.")); + + auto input_dim = ctx->GetInputDim("Input"); + auto index_dim = ctx->GetInputDim("Index"); + + PADDLE_ENFORCE_GT(input_dim.size(), 0, + platform::errors::InvalidArgument( + "Dimension of the input(Input) of TakeAlongAxisOp " + "should be greater than 0.", + input_dim)); + + PADDLE_ENFORCE_GT(index_dim.size(), 0, + platform::errors::InvalidArgument( + "Dimension of the input(Index) of TakeAlongAxisOp " + "should be greater than 0.", + index_dim)); + + ctx->SetOutputDim("Result", index_dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class TakeAlongAxisOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", "The input tensor of TakeAlongAxisOp"); + AddInput("Index", "The index tensor of TakeAlongAxisOp"); + AddOutput("Result", "The result tensor of TakeAlongAxisOp"); + AddAttr("Axis", + "The Tensor which contains the axis that we do TakeAlongAxis " + "operation."); + AddComment(R"DOC( + Take_along_axis Operator.) 
+ )DOC"); + } +}; + +class TakeAlongAxisGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Input"), + ctx->GetInputDim("Input")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Result")), + ctx.device_context()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +template +class TakeAlongAxisGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("take_along_axis_grad"); + op->SetInput("Index", this->Input("Index")); + op->SetInput("Input", this->Input("Input")); + + op->SetInput(framework::GradVarName("Result"), this->OutputGrad("Result")); + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp, + ops::TakeAlongAxisOpMaker, + ops::TakeAlongAxisGradOpMaker, + ops::TakeAlongAxisGradOpMaker); + +REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); + +REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel, + ops::TakeAlongAxisOpKernel, + ops::TakeAlongAxisOpKernel, + ops::TakeAlongAxisOpKernel, + ops::TakeAlongAxisOpKernel); + +REGISTER_OP_CPU_KERNEL(take_along_axis_grad, + ops::TakeAlongAxisGradOpKernel, + ops::TakeAlongAxisGradOpKernel, + ops::TakeAlongAxisGradOpKernel, + ops::TakeAlongAxisGradOpKernel, + ops::TakeAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu new file mode 100644 index 0000000000000..e9f9b18718787 --- /dev/null +++ b/paddle/fluid/operators/take_along_axis_op.cu @@ -0,0 +1,97 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
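take_along_axis, registered above, gathers one element of Input per element of Index while walking the chosen Axis; in the 2-D, Axis = 1 case this is Result[i][j] = Input[i][Index[i][j]], and the grad kernels scatter-add the output gradient back through the same indices (hence the "gradient of gather is scatter" comment). A standalone 2-D sketch of the forward rule, not the Paddle kernels:

#include <cstdint>
#include <cstdio>
#include <vector>

// Result[i][j] = Input[i][Index[i][j]]  (take_along_axis with axis == 1).
std::vector<float> TakeAlongAxis1(const std::vector<float>& input,
                                  const std::vector<int64_t>& index,
                                  int64_t rows, int64_t in_cols,
                                  int64_t idx_cols) {
  std::vector<float> result(rows * idx_cols);
  for (int64_t i = 0; i < rows; ++i)
    for (int64_t j = 0; j < idx_cols; ++j)
      result[i * idx_cols + j] = input[i * in_cols + index[i * idx_cols + j]];
  return result;
}

int main() {
  // Input is 2x3; Index is 2x2 and selects columns per row.
  std::vector<float> input = {1, 2, 3, 4, 5, 6};
  std::vector<int64_t> index = {0, 2, 1, 1};
  std::vector<float> r = TakeAlongAxis1(input, index, 2, 3, 2);
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  // prints: 1 3 5 5
  return 0;
}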
*/ + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/take_along_axis_op.h" + +namespace paddle { +namespace operators { + +template +class TakeAlongAxisCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet( + "This kernel only runs on GPU device.")); + auto input = ctx.Input("Input"); + auto axis = ctx.Attr("Axis"); + auto index = ctx.Input("Index"); + auto result = ctx.Output("Result"); + result->Resize(index->dims()); + result->mutable_data(ctx.GetPlace()); + const auto &index_type = index->type(); + if (index_type == framework::proto::VarType::INT32) { + gpu_gather_kernel(*input, axis, *index, *result, + ctx.device_context()); + } else if (index_type == framework::proto::VarType::INT64) { + gpu_gather_kernel(*input, axis, *index, *result, + ctx.device_context()); + } + } +}; + +template +class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("This kernel only runs on GPU.")); + + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto index = ctx.Input("Index"); + auto result_grad = ctx.Input(framework::GradVarName("Result")); + auto axis = ctx.Attr("Axis"); + // We need to know the shape of input matrix to determine the shape of grad + // matrix of input. + auto input = ctx.Input("Input"); + input_grad->Resize(input->dims()); + input_grad->mutable_data(ctx.GetPlace()); + + // Set to zero tensor. + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(reinterpret_cast(dev_ctx), + input_grad, static_cast(0)); + const auto &index_type = index->type(); + + if (index_type == framework::proto::VarType::INT32) { + gpu_scatter_add_kernel( + *input_grad, axis, *index, *result_grad, + ctx.device_context()); // the gradient of gather is scatter + } else if (index_type == framework::proto::VarType::INT64) { + gpu_scatter_add_kernel(*input_grad, axis, *index, + *result_grad, ctx.device_context()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel, + ops::TakeAlongAxisCUDAKernel, + ops::TakeAlongAxisCUDAKernel, + ops::TakeAlongAxisCUDAKernel, + ops::TakeAlongAxisCUDAKernel); +REGISTER_OP_CUDA_KERNEL(take_along_axis_grad, + ops::TakeAlongAxisGradOpCUDAKernel, + ops::TakeAlongAxisGradOpCUDAKernel, + ops::TakeAlongAxisGradOpCUDAKernel, + ops::TakeAlongAxisGradOpCUDAKernel, + ops::TakeAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h new file mode 100644 index 0000000000000..580ca528ceb32 --- /dev/null +++ b/paddle/fluid/operators/take_along_axis_op.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class TakeAlongAxisOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); + + auto input = ctx.Input("Input"); + auto axis = ctx.Attr("Axis"); + auto index = ctx.Input("Index"); + auto result = ctx.Output("Result"); + result->Resize(index->dims()); + result->mutable_data(ctx.GetPlace()); + + const auto &index_type = index->type(); + if (index_type == framework::proto::VarType::INT32) { + cpu_gather_kernel(*input, axis, *index, *result, + ctx.device_context()); + } else if (index_type == framework::proto::VarType::INT64) { + cpu_gather_kernel(*input, axis, *index, *result, + ctx.device_context()); + } + } +}; + +template +class TakeAlongAxisGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); + + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto index = ctx.Input("Index"); + auto result_grad = ctx.Input(framework::GradVarName("Result")); + auto axis = ctx.Attr("Axis"); + // We need to know the shape of input matrix to determine the shape of grad + // matrix of input. + auto input = ctx.Input("Input"); + input_grad->Resize(input->dims()); + input_grad->mutable_data(ctx.GetPlace()); + + // Set to zero tensor. + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(reinterpret_cast(dev_ctx), + input_grad, static_cast(0)); + + const auto &index_type = index->type(); + if (index_type == framework::proto::VarType::INT32) { + cpu_scatter_add_kernel( + *input_grad, axis, *index, *result_grad, + ctx.device_context()); // the gradient of gather is scatter + } else if (index_type == framework::proto::VarType::INT64) { + cpu_scatter_add_kernel(*input_grad, axis, *index, + *result_grad, ctx.device_context()); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 35612905f8569..5ebf67587f3cb 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -250,6 +250,23 @@ class TensorRTEngineOp : public framework::OperatorBase { } } + void PrepareTRTEngine(const framework::Scope &scope, + TensorRTEngine *engine) const { + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). 
This process may cost a lot of time."; + framework::proto::BlockDesc block_proto; + block_proto.ParseFromString(Attr("subgraph")); + framework::BlockDesc block_desc(nullptr, &block_proto); + + std::vector inputs = Inputs("Xs"); + std::vector outputs = + Attr>("output_name_mapping"); + + inference::Singleton::Global() + .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_, + outputs, engine); + } + protected: void RunNativeImpl(const framework::Scope &scope, const platform::Place &dev_place) const { @@ -388,7 +405,7 @@ class TensorRTEngineOp : public framework::OperatorBase { if (param_names_.count(x)) continue; auto &t = inference::analysis::GetFromScope(scope, x); - calib_data.emplace(x, t.data()); + calib_data.emplace(x, t.data()); } temp_calibrator->setBatch(calib_data); RunNativeImpl(scope, dev_place); @@ -414,8 +431,19 @@ class TensorRTEngineOp : public framework::OperatorBase { int num_inputs = 0; num_inputs += runtime_input_names_.size(); - const int num_bindings = num_inputs + Outputs("Ys").size(); - std::vector buffers(num_bindings); + // const int num_bindings = num_inputs + Outputs("Ys").size(); + // std::vector buffers(num_bindings); + // This method returns the total over all profiles. + const int num_bindings = engine->GetNbBindings(); + std::vector buffers(num_bindings, nullptr); + + int binding_offset = 0; + nvinfer1::IExecutionContext *trt_context = nullptr; + if (engine->with_dynamic_shape()) { + // Initilize context and get offset by profile index + trt_context = engine->context(); + binding_offset = engine->GetBindingsOffset(); + } // Bind input tensor to TRT. for (const auto &x : runtime_input_names_) { @@ -430,7 +458,10 @@ class TensorRTEngineOp : public framework::OperatorBase { t.ShareDataWith(out); } auto t_shape = framework::vectorize(t.dims()); - const int bind_index = engine->engine()->getBindingIndex(x.c_str()); + // const int bind_index = engine->engine()->getBindingIndex(x.c_str()); + // Get index of profile 0 first, then plus binding offset + const int bind_index = + engine->engine()->getBindingIndex(x.c_str()) + binding_offset; PADDLE_ENFORCE_LT( bind_index, num_bindings, platform::errors::InvalidArgument( @@ -474,7 +505,6 @@ class TensorRTEngineOp : public framework::OperatorBase { } } else { #if IS_TRT_VERSION_GE(6000) - auto *trt_context = engine->context(); trt_context->setBindingDimensions( bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true)); #endif @@ -500,7 +530,8 @@ class TensorRTEngineOp : public framework::OperatorBase { VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto &y : Outputs("Ys")) { const int bind_index = - engine->engine()->getBindingIndex(output_maps[output_index].c_str()); + engine->engine()->getBindingIndex(output_maps[output_index].c_str()) + + binding_offset; std::vector ddim; if (!engine->with_dynamic_shape()) { @@ -511,7 +542,6 @@ class TensorRTEngineOp : public framework::OperatorBase { } } else { #if IS_TRT_VERSION_GE(6000) - auto *trt_context = engine->context(); auto dims = trt_context->getBindingDimensions(bind_index); int nb_dims = dims.nbDims; for (; nb_dims > 0; nb_dims--) { @@ -583,23 +613,6 @@ class TensorRTEngineOp : public framework::OperatorBase { } return trt_engine_; } - - void PrepareTRTEngine(const framework::Scope &scope, - TensorRTEngine *engine) const { - LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " - "kernel etc). 
This process may cost a lot of time."; - framework::proto::BlockDesc block_proto; - block_proto.ParseFromString(Attr("subgraph")); - framework::BlockDesc block_desc(nullptr, &block_proto); - - std::vector inputs = Inputs("Xs"); - std::vector outputs = - Attr>("output_name_mapping"); - - inference::Singleton::Global() - .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_, - outputs, engine); - } }; } // namespace operators diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc new file mode 100644 index 0000000000000..141a0ede4f8b0 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc @@ -0,0 +1,198 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include + +#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "xpu/refactor/math.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +template +class TopkV2XPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + const auto& in_dims = input->dims(); + const T* in_data = input->data(); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + T* output_data = output->mutable_data(ctx.GetPlace()); + const auto& out_dims = output->dims(); + + const auto& sorted = static_cast(ctx.Attr("sorted")); + const auto& largest = static_cast(ctx.Attr("largest")); + PADDLE_ENFORCE_EQ( + sorted, true, + platform::errors::External( + "XPU API does not support unsorted topk operation currently." + " Operator will be supported in future update.")); + PADDLE_ENFORCE_EQ( + largest, true, + platform::errors::External( + "XPU API does not support smallest topk operation currently." 
+ " Operator will be supported in future update.")); + + int axis = static_cast(ctx.Attr("axis")); + if (axis < 0) axis += in_dims.size(); + + size_t k = static_cast(ctx.Attr("k")); + auto* k_t = ctx.Input("K"); + if (k_t) { + k = k_t->data()[0]; + framework::DDim output_dims = output->dims(); + output_dims[axis] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + if (axis + 1 == in_dims.size()) { + auto& dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int32_t* indices_int_data = + RAII_GUARD.alloc_l3_or_gm(indices->numel()); + + const size_t row = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const size_t col = in_dims[in_dims.size() - 1]; + int r = xpu::sorted_topk(dev_ctx.x_context(), in_data, output_data, + indices_int_data, row, col, k); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d %s] in call kernel name " + "[%s], please check " + "where Baidu Kunlun Card is properly installed.", + r, XPUAPIErrorMsg[r], "sorted_topk")); + r = xpu::cast_v2(dev_ctx.x_context(), + (const int32_t*)indices_int_data, + indices_data, indices->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d %s] in call kernel name " + "[%s], please check " + "where Baidu Kunlun Card is properly installed.", + r, XPUAPIErrorMsg[r], "cast_v2")); + + } else { + // do transpose if axis is not the last dim of input + std::vector trans_axes; + for (int i = 0; i < axis; i++) { + trans_axes.emplace_back(i); + } + for (int i = axis + 1; i < in_dims.size(); i++) { + trans_axes.emplace_back(i); + } + trans_axes.emplace_back(axis); + // Get input and output dims for transpose + framework::DDim trans_dims(in_dims); + framework::DDim trans_out_dims(output->dims()); + for (size_t i = 0; i < trans_axes.size(); i++) { + trans_dims[i] = in_dims[trans_axes[i]]; + trans_out_dims[i] = out_dims[trans_axes[i]]; + } + + std::vector x_shape_host(in_dims.size(), 0); + for (int i = 0; i < in_dims.size(); ++i) { + x_shape_host[i] = in_dims[i]; + } + + auto& dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + T* trans_in_data = RAII_GUARD.alloc_l3_or_gm(input->numel()); + + // Transpose and save interval output to trans_in + int r = xpu::transpose(dev_ctx.x_context(), in_data, trans_in_data, + x_shape_host, trans_axes); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU API 1st Transpose kernel" + " returns wrong value[%d %s]!", + r, XPUAPIErrorMsg[r])); + + T* trans_out_data = RAII_GUARD.alloc_l3_or_gm(output->numel()); + int64_t* trans_idx_data = + RAII_GUARD.alloc_l3_or_gm(output->numel()); + int32_t* trans_idx_int32_data = + RAII_GUARD.alloc_l3_or_gm(output->numel()); + const size_t row = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const size_t col = trans_dims[trans_dims.size() - 1]; + + // Do top k on transposed input + r = xpu::sorted_topk(dev_ctx.x_context(), trans_in_data, + trans_out_data, trans_idx_int32_data, row, col, + k); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d %s] in call kernel name " + "[%s], please check " + "where Baidu Kunlun Card is properly installed.", + r, XPUAPIErrorMsg[r], "sorted_topk")); + + r = xpu::cast_v2(dev_ctx.x_context(), + (const int32_t*)trans_idx_int32_data, + trans_idx_data, indices->numel()); + PADDLE_ENFORCE_EQ( + r, 
XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d %s] in call kernel name " + "[%s], please check " + "where Baidu Kunlun Card is properly installed.", + r, XPUAPIErrorMsg[r], "cast_v2")); + + // Transpose back to original dims + std::vector trans_back_axes; + for (int i = 0; i < axis; i++) { + trans_back_axes.emplace_back(i); + } + trans_back_axes.emplace_back(trans_out_dims.size() - 1); + for (int i = axis; i < trans_out_dims.size() - 1; i++) { + trans_back_axes.emplace_back(i); + } + + std::vector trans_out_shape_host(trans_back_axes.size(), 0); + for (size_t i = 0; i < trans_back_axes.size(); ++i) { + trans_out_shape_host[i] = trans_out_dims[i]; + } + r = xpu::transpose(dev_ctx.x_context(), trans_out_data, output_data, + trans_out_shape_host, trans_back_axes); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU API 2nd Transpose kernel" + " returns wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::transpose(dev_ctx.x_context(), trans_idx_data, + indices_data, trans_out_shape_host, + trans_back_axes); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU API 3rd Transpose kernel" + " returns wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(top_k_v2, ops::TopkV2XPUKernel); +#endif diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc index 994aabd66cf93..bf3a985923f87 100644 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ b/paddle/fluid/operators/transfer_layout_op.cc @@ -40,7 +40,7 @@ class TransferLayoutOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TransferLayout"); auto dst_layout = ctx->Attrs().Get("dst_layout"); - auto low_bound = static_cast(framework::DataLayout::kNHWC); + auto low_bound = static_cast(framework::DataLayout::kAnyLayout); auto upper_bound = static_cast(framework::DataLayout::kMKLDNN); PADDLE_ENFORCE_GE( dst_layout, low_bound, @@ -106,7 +106,7 @@ class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(LoDTensor) The input Tensor"); AddOutput("Out", "(LoDTensor) The Output Tensor with desired layout"); AddAttr("dst_layout", - "kNHWC = 0, kNCHW = 1, kAnyLayout = 2, kMKLDNN = 3"); + "kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3"); AddComment(R"DOC( TransferLayout Operator)DOC"); } diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index 1d740093b4fbf..1f09aec05b936 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -66,7 +66,7 @@ class TransferLayoutFunctor { // Just set layout/format. 
No real transform occur auto out_format = platform::MKLDNNFormatForSize( - in_tensor.dims().size(), ToMKLDNNFormat(in_layout)); + in_tensor.dims().size(), framework::ToMKLDNNFormat(in_layout)); out_tensor.ShareDataWith(in_tensor); // For NHWC data we need reshape of tensors as MKL-DNN // is expecting NHWC dims description order diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu index dfd48fb47e52f..b7ea5cd953186 100644 --- a/paddle/fluid/operators/triangular_solve_op.cu +++ b/paddle/fluid/operators/triangular_solve_op.cu @@ -19,7 +19,8 @@ namespace paddle { namespace operators { template -struct MatrixReduceSumFunctor { +class MatrixReduceSumFunctor { + public: void operator()(const Tensor& in, Tensor* out, const framework::ExecutionContext& ctx) { // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 007276b16d7f2..cdb4ad7c40826 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -27,7 +27,7 @@ namespace { template inline void UniformRealDistribution(T *data, const int64_t &size, const float &min, const float &max, - const unsigned int &seed) { + const unsigned int seed) { VLOG(4) << "[CPU] UniformRandomKernel"; std::uniform_real_distribution dist(static_cast(min), static_cast(max)); @@ -41,8 +41,7 @@ inline void UniformRealDistribution(T *data, const int64_t &size, template <> inline void UniformRealDistribution(paddle::platform::bfloat16 *data, const int64_t &size, const float &min, - const float &max, - const unsigned int &seed) { + const float &max, const unsigned int seed) { VLOG(4) << "[CPU] UniformRandomKernel"; std::uniform_real_distribution dist(min, max); auto engine = paddle::framework::GetCPURandomEngine(seed); diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 108cd2722b5ed..8edfb4bc6c52f 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -76,6 +76,65 @@ Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf } }; +class Unpool3dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor) The input tensor of unpool operator. " + "The format of input tensor is NCDHW. Where N is batch size, C is the " + "number of channels, D, H and W is the depth, height and width of " + "feature."); + AddInput( + "Indices", + "(Tensor) The input tensor of the indices given out by MaxPool3d. " + "The format of input tensor is NCDHW. Where N is batch size, C is the " + "number of channels, D, H and W is the depth, height and width of " + "feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCDHW." 
+ "Where N is batch size, C is " + "the number of channels, D, H and W is the depth, height and " + "width of feature."); + AddAttr>( + "ksize", + "(vector), the unpooling window size(depth, height, width) " + "of unpooling operator."); + AddAttr>( + "strides", + "(vector, default:{1, 1, 1}), " + "strides (depth, height, width) of unpooling operator.") + .SetDefault({1, 1, 1}); + AddAttr>( + "paddings", + "(vector default:{0, 0,0}), " + "paddings (depth, height, width) of unpooling operator.") + .SetDefault({0, 0, 0}); + AddAttr( + "unpooling_type", + "(string), unpooling type, can be \"max\" for max-unpooling ") + .InEnum({"max"}); + AddAttr>("output_size", + "(vector, optional). The shape of output.") + .SetDefault({0, 0, 0}); + AddAttr( + "data_format", + "(string, default NCDHW)" + "Defaults to \"NCDHW\". Specify the data format of the output data, ") + .SetDefault("NCDHW"); + AddComment(R"DOC( +Input shape is: $(N, C_{in}, D_{in}, H_{in}, W_{in})$, Output shape is: +$(N, C_{out}, D_{out}, H_{out}, W_{out})$, where +$$ +D_{out} = (D_{in}-1) * strides[0] - 2 * paddings[0] + ksize[0] \\ +H_{out} = (H_{in}-1) * strides[1] - 2 * paddings[1] + ksize[1] \\ +W_{out} = (W_{in}-1) * strides[2] - 2 * paddings[2] + ksize[2] +$$ +)DOC"); + } +}; + int UnpoolOutputSize(int input_size, int ksize, int padding, int stride) { int output_size = (input_size - 1) * stride - 2 * padding + ksize; return output_size; @@ -130,6 +189,55 @@ class UnpoolOp : public framework::OperatorWithKernel { } }; +class Unpool3dOp : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Unpool3d"); + OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Unpool3d"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Unpool3d"); + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get("unpooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector output_size = + ctx->Attrs().Get>("output_size"); + PADDLE_ENFORCE_EQ(in_x_dims.size() == 5, true, + platform::errors::InvalidArgument( + "Unpool Intput(X) must be of 5-dimensional, but " + "received Input(X)'s dimensions is %d.", + in_x_dims.size())); + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims, + platform::errors::InvalidArgument( + "The dimensions of Input(X) must equal to be" + "the dimensions of Input(Indices), but received" + "dimensions of Input(X) is [%d], received dimensions" + "of Input(Indices) is [%d]", + in_x_dims, in_y_dims)); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + if (!ctx->IsRuntime() && in_x_dims[i + 2] <= 0) { + output_shape.push_back(-1); + } else { + output_shape.push_back(output_size[i]); + } + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + template class UnpoolOpGradMaker : public framework::SingleGradOpMaker { public: @@ -145,6 +253,21 @@ class UnpoolOpGradMaker : public framework::SingleGradOpMaker { } }; +template +class 
Unpool3dOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Indices", this->Input("Indices")); + op->SetInput("Out", this->Output("Out")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + class UnpoolOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( @@ -163,6 +286,26 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; + +class Unpool3dOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Unpool3dGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "Unpool3dGrad"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + } // namespace operators } // namespace paddle @@ -179,3 +322,16 @@ REGISTER_OP_CPU_KERNEL( unpool_grad, ops::UnpoolGradKernel, ops::UnpoolGradKernel); + +REGISTER_OPERATOR(unpool3d, ops::Unpool3dOp, ops::Unpool3dOpMaker, + ops::Unpool3dOpGradMaker, + ops::Unpool3dOpGradMaker); + +REGISTER_OPERATOR(unpool3d_grad, ops::Unpool3dOpGrad); +REGISTER_OP_CPU_KERNEL( + unpool3d, ops::Unpool3dKernel, + ops::Unpool3dKernel); +REGISTER_OP_CPU_KERNEL( + unpool3d_grad, + ops::Unpool3dGradKernel, + ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc index 7c59a0feaa472..e3cab4426b4d8 100644 --- a/paddle/fluid/operators/unpool_op.cu.cc +++ b/paddle/fluid/operators/unpool_op.cu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,3 +22,10 @@ REGISTER_OP_CUDA_KERNEL( unpool_grad, ops::UnpoolGradKernel, ops::UnpoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + unpool3d, ops::Unpool3dKernel, + ops::Unpool3dKernel); +REGISTER_OP_CUDA_KERNEL( + unpool3d_grad, + ops::Unpool3dGradKernel, + ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h index e388ec5ae3937..52849cb3e0f8e 100644 --- a/paddle/fluid/operators/unpool_op.h +++ b/paddle/fluid/operators/unpool_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -69,5 +69,54 @@ class UnpoolGradKernel : public framework::OpKernel { unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); } }; + +template +class Unpool3dKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + auto* out = context.Output("Out"); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + T* output_data = out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + if (output_data) { + math::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + } + math::Unpool3dMaxFunctor unpool3d_max_forward; + unpool3d_max_forward(dev_ctx, *in_x, *in_y, out); + } +}; + +template +class Unpool3dGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& device_ctx = context.template device_context(); + math::SetConstant zero; + + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + + math::Unpool3dMaxGradFunctor unpool3d_max_backward; + unpool3d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 26bf5d8b1be9d..1031d1ed6357d 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -46,7 +46,7 @@ IF(WITH_XBYAK) ENDIF() cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -cc_library(os_info SRCS os_info.cc DEPS enforce device_tracer) +cc_library(os_info SRCS os_info.cc DEPS enforce) IF(WITH_GPU) nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) @@ -169,15 +169,16 @@ cc_test(timer_test SRCS timer_test.cc DEPS timer) cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto) cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer) +cc_library(host_event_recorder SRCS host_event_recorder.cc DEPS os_info) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce dynload_cuda) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc 
profiler.cu DEPS os_info device_tracer gpu_info enforce) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce) + cc_library(profiler SRCS profiler.cc DEPS host_event_recorder os_info device_tracer enforce) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h index a362e2903f245..100444eee7bc0 100644 --- a/paddle/fluid/platform/bfloat16.h +++ b/paddle/fluid/platform/bfloat16.h @@ -155,14 +155,24 @@ struct PADDLE_ALIGN(2) bfloat16 { // Conversion opertors HOSTDEVICE inline explicit operator float() const { +#ifdef PADDLE_WITH_HIP + uint32_t res = 0; + // We should be using memcpy in order to respect the strict aliasing rule + // but it fails in the HIP environment. + uint16_t temp = x; + uint16_t* temp_ptr = reinterpret_cast(&temp); + res = *temp_ptr; + return res; +#else #ifdef PADDLE_CUDA_BF16 return __bfloat162float(*reinterpret_cast(&x)); #else float val = 0.f; uint16_t temp = x; - memcpy(reinterpret_cast(&val) + 2, reinterpret_cast(&temp), - 2); + std::memcpy(reinterpret_cast(&val) + 2, + reinterpret_cast(&temp), 2); return val; +#endif #endif } diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 25f8f3ed9f3d8..7d2ea57545d08 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" namespace paddle { @@ -292,17 +293,8 @@ BKCLComm* BKCLCommContext::CreateComm(BKCLUniqueId* bkcl_id, int nranks, "Expected dev_id >= 0. But received dev_id is %d.", dev_id)); BKCLContext_t comm = nullptr; - auto ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::PreconditionNotMet( - "XPU API return wrong value[%d %s], please check whether " - "Baidu Kunlun Card is properly installed.", - ret, XPUAPIErrorMsg[ret])); - ret = bkcl_init_rank(&comm, rank, nranks, bkcl_id); - PADDLE_ENFORCE_EQ(ret, BKCL_SUCCESS, - platform::errors::PreconditionNotMet( - "bkcl_init_rank failed, got wrong value [%d].", ret)); + platform::SetXPUDeviceId(dev_id); + PADDLE_ENFORCE_XPU_SUCCESS(bkcl_init_rank(&comm, rank, nranks, bkcl_id)); auto* comm_wrapper = AssignBKCLComm(comm, nranks, rank, dev_id, ring_id); diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h new file mode 100644 index 0000000000000..43408ca207d1d --- /dev/null +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +/**************************** Enforce Wrapper **************************/ + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/device/npu/enforce_npu.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/enforce.h" +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index e7d807573957f..7fe2367b5510e 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -73,7 +73,7 @@ template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { - return float16(__shfl_down_sync(mask, static_cast(val), + return float16(__shfl_down_sync(mask, val.to_half(), static_cast(delta), width)); } @@ -103,7 +103,7 @@ CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, template <> __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, float16 val, int width) { - return float16(__shfl_xor_sync(mask, static_cast(val), width)); + return float16(__shfl_xor_sync(mask, val.to_half(), width)); } template <> diff --git a/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h b/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h new file mode 100644 index 0000000000000..43da9bb1fb42d --- /dev/null +++ b/paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include // NOLINT + +#include "paddle/fluid/platform/dynload/cusparse.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +class CusparseHandleHolder { + public: + explicit CusparseHandleHolder(cudaStream_t stream) { +// ROCM is not yet supported +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 10010 + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&handle_)); + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(handle_, stream)); +#endif +#endif + } + const cusparseHandle_t& GetCusparseHandle() const { return handle_; } + + ~CusparseHandleHolder() PADDLE_MAY_THROW { +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 10010 + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle_)); +#endif +#endif + } + + template + inline void Call(Callback&& callback) const { + std::lock_guard guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CusparseHandleHolder); + + cusparseHandle_t handle_; + mutable std::mutex mtx_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index 6077a7b625d25..878a122a49224 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -19,6 +19,7 @@ #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" #else #include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" +#include "paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h" #endif #define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 3001ef2ff80c4..aeb0e32039bcc 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -286,8 +286,7 @@ void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { auto const_data = std::unique_ptr(); popart::TensorInfo tensor_info(VarType2PopartType(tensor->type()), shape); - const_data.reset( - new popart::ConstVoidData(tensor->data(), tensor_info)); + const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info)); popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); SetIpuIndexStage(result, op_desc); one_builder_->tensors.emplace(tensor_name, result); @@ -320,7 +319,7 @@ void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { shape.push_back(tensor.dims().at(i)); } popart::TensorInfo tensor_info(dtype, shape); - popart::ConstVoidData const_data{tensor.data(), tensor_info}; + popart::ConstVoidData const_data{tensor.data(), tensor_info}; popart::TensorId result = builder_->addInitializedInputTensor(const_data, var_name); one_builder_->tensors.emplace(var_name, result); diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index f6ec666a50425..33a1026506fc2 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -84,8 +84,7 @@ void Executor::Run(const std::vector &inputs, std::map input_wrappers; for (size_t i = 0; i < inputs.size(); i++) { auto tensor_id = one_builder_->inputs[i]; - auto tensor = const_cast(inputs[i]); - input_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + input_wrappers.emplace(tensor_id, PaddleIArray(inputs[i])); 
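// Note on the line above: PaddleIArray's constructor now accepts a const Tensor*
// (see the ipu_utils.h hunk further down), so the const_cast that previously
// wrapped inputs[i] on this input path is no longer needed.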
popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id)); } // anchors @@ -93,7 +92,6 @@ void Executor::Run(const std::vector &inputs, std::map anchor_wrappers; for (size_t i = 0; i < outputs.size(); i++) { auto tensor_id = one_builder_->outputs[i]; - auto tensor = const_cast(outputs[i]); // get dims & dtype from session auto fetch_info = session_->getInfo(tensor_id); auto output_shape = fetch_info.shape(); @@ -110,6 +108,7 @@ void Executor::Run(const std::vector &inputs, ipu_strategy_->popart_options.replicatedGraphCount); } + auto *tensor = outputs[i]; tensor->Resize(framework::make_ddim(output_shape)); auto fetch_dtype = fetch_info.dataType(); auto paddle_type = PopartType2VarType(fetch_dtype); @@ -194,7 +193,7 @@ void Executor::SetWeightsIO() { } auto var = scope_->GetVar(paddle_var_name); - auto data_ptr = var->GetMutable()->data(); + auto data_ptr = var->GetMutable()->data(); auto tensor_info = session_->getInfo(popart_var_name); one_session_->weights_io.insert(popart_var_name, {data_ptr, tensor_info}); diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.cc b/paddle/fluid/platform/device/ipu/ipu_utils.cc index 4d68cb81ecdd3..dbe8c75be209b 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.cc +++ b/paddle/fluid/platform/device/ipu/ipu_utils.cc @@ -19,7 +19,8 @@ namespace paddle { namespace platform { namespace ipu { -void* PaddleIArray::data() { return const_cast(tensor_->data()); } +// TODO(alleng) remove const_cast +void* PaddleIArray::data() { return const_cast(tensor_->data()); } popart::DataType PaddleIArray::dataType() const { return VarType2PopartType(tensor_->type()); diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h index 44a333b66f463..d8bbc11d0df08 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.h +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -62,7 +62,7 @@ enum ONNXDataType : int { class PaddleIArray final : public popart::IArray { public: - explicit PaddleIArray(Tensor* tensor) : tensor_(tensor) { + explicit PaddleIArray(const Tensor* tensor) : tensor_(tensor) { for (int i = 0; i < tensor->dims().size(); ++i) { shape_.push_back(tensor->dims().at(i)); } @@ -96,7 +96,7 @@ std::unique_ptr> Tensor2IArray(const Tensor& tensor) { popart::TensorInfo tensor_info(dtype, shape); return std::make_unique>( - reinterpret_cast(tensor.data()), tensor_info); + reinterpret_cast(tensor.data()), tensor_info); } template diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index ed74a94c09502..78e5cb0ab106e 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -401,7 +401,7 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor, } aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) { - void *ptr = tensor.data(); + void *ptr = tensor.data(); VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.memory_size(); auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size()); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index 39c1fc9d041ea..e83057e682fef 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -150,8 +150,8 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { *npu_pinned_ptr = val; memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()), - tensor->data(), npu_pinned_place, 
npu_pinned_ptr, - sizeof(T), GetCurrentNPUStream()); + tensor->data(), npu_pinned_place, npu_pinned_ptr, sizeof(T), + GetCurrentNPUStream()); auto npu_pinned_allocator = static_cast( diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index 17f492f93e534..b1fc9a0cedd0b 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -6,3 +6,5 @@ set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context) + +add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h index cccee15719488..d9ffbfe011f91 100644 --- a/paddle/fluid/platform/device/xpu/bkcl_helper.h +++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h @@ -26,8 +26,8 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/place.h" #include "xpu/bkcl.h" @@ -73,13 +73,9 @@ struct InitBKCLPara { static void *init_bkcl_context_func(void *args) { struct InitBKCLPara *para = (struct InitBKCLPara *)args; - PADDLE_ENFORCE_EQ(xpu_set_device(para->dev_id), XPU_SUCCESS, - platform::errors::PreconditionNotMet( - "xpu_set_device failed[%d]", para->dev_id)); - PADDLE_ENFORCE_EQ( - bkcl_init_rank(para->ctx, para->rank, para->nranks, para->bkcl_id), - BKCL_SUCCESS, - platform::errors::PreconditionNotMet("bkcl_init_rank failed")); + platform::SetXPUDeviceId(para->dev_id); + PADDLE_ENFORCE_XPU_SUCCESS( + bkcl_init_rank(para->ctx, para->rank, para->nranks, para->bkcl_id)); return nullptr; } diff --git a/paddle/fluid/platform/device/xpu/enforce_xpu.h b/paddle/fluid/platform/device/xpu/enforce_xpu.h new file mode 100644 index 0000000000000..839f14067782d --- /dev/null +++ b/paddle/fluid/platform/device/xpu/enforce_xpu.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/fluid/platform/enforce.h" +#include "xpu/bkcl.h" + +namespace paddle { +namespace platform { + +// Note: XPU runtime api return int, not XPUError_t +inline const char* xpuGetErrorString(int stat) { + switch (stat) { + case XPU_SUCCESS: + return "Success"; + case XPUERR_INVALID_DEVICE: + return "Invalid XPU device"; + case XPUERR_UNINIT: + return "XPU runtime not properly inited"; + case XPUERR_NOMEM: + return "Device memory not enough"; + case XPUERR_NOCPUMEM: + return "CPU memory not enough"; + case XPUERR_INVALID_PARAM: + return "Invalid parameter"; + case XPUERR_NOXPUFUNC: + return "Cannot get XPU Func"; + case XPUERR_LDSO: + return "Error loading dynamic library"; + case XPUERR_LDSYM: + return "Error loading func from dynamic library"; + case XPUERR_SIMULATOR: + return "Error from XPU Simulator"; + case XPUERR_NOSUPPORT: + return "Operation not supported"; + case XPUERR_ABNORMAL: + return "Device abnormal due to previous error"; + case XPUERR_KEXCEPTION: + return "Exception in kernel execution"; + case XPUERR_TIMEOUT: + return "Kernel execution timed out"; + case XPUERR_BUSY: + return "Resource busy"; + case XPUERR_USEAFCLOSE: + return "Use a stream after closed"; + case XPUERR_UCECC: + return "Uncorrectable ECC"; + case XPUERR_OVERHEAT: + return "Overheat"; + case XPUERR_UNEXPECT: + return "Execution error, reach unexpected control flow"; + case XPUERR_DEVRESET: + return "Device is being reset, try again later"; + case XPUERR_HWEXCEPTION: + return "Hardware module exception"; + case XPUERR_HBM_INIT: + return "Error init HBM"; + case XPUERR_DEVINIT: + return "Error init device"; + case XPUERR_PEERRESET: + return "Device is being reset, try again later"; + case XPUERR_MAXDEV: + return "Device count exceed limit"; + case XPUERR_NOIOC: + return "Unknown IOCTL command"; + case XPUERR_DMATIMEOUT: + return "DMA timed out, a reboot maybe needed"; + case XPUERR_DMAABORT: + return "DMA aborted due to error, possibly wrong address or hardware " + "state"; + case XPUERR_MCUUNINIT: + return "Firmware not initialized"; + case XPUERR_OLDFW: + return "Firmware version too old (<15), please update."; + case XPUERR_PCIE: + return "Error in PCIE"; + case XPUERR_FAULT: + return "Error copy between kernel and user space"; + case XPUERR_INTERRUPTED: + return "Execution interrupted by user"; + default: + return "unkonwn error"; + } +} + +inline const char* bkclGetErrorString(BKCLResult_t stat) { + switch (stat) { + case BKCL_SUCCESS: + return "BKCL_SUCCESS"; + case BKCL_INVALID_ARGUMENT: + return "BKCL_INVALID_ARGUMENT"; + case BKCL_RUNTIME_ERROR: + return "BKCL_RUNTIME_ERROR"; + case BKCL_SYSTEM_ERROR: + return "BKCL_SYSTEM_ERROR"; + case BKCL_INTERNAL_ERROR: + return "BKCL_INTERNAL_ERROR"; + default: + return "Unknown BKCL status"; + } +} + +inline std::string build_xpu_error_msg(int stat) { + std::string msg("XPU Error <" + std::to_string(stat) + ">, "); + return msg + xpuGetErrorString(stat) + " "; +} + +inline std::string build_xpu_error_msg(BKCLResult_t stat) { + std::string msg("BKCL Error, "); + return msg + bkclGetErrorString(stat) + " "; +} + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_EXTERNAL_API_TYPE(int, XPU_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); + +#undef 
DEFINE_EXTERNAL_API_TYPE + +} // namespace details + +#define PADDLE_ENFORCE_XPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __XPU_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::ExternalApiType< \ + __XPU_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = paddle::platform::errors::External( \ + ::paddle::platform::build_xpu_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt b/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt new file mode 100644 index 0000000000000..6d98fefcf8317 --- /dev/null +++ b/paddle/fluid/platform/device/xpu/tests/CMakeLists.txt @@ -0,0 +1 @@ +cc_test(enforce_xpu_test SRCS enforce_xpu_test.cc DEPS stringpiece) diff --git a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc new file mode 100644 index 0000000000000..730bcdb37fd7b --- /dev/null +++ b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#include "gtest/gtest.h" + +template +bool CheckXPUStatusSuccess(T value, const std::string& msg = "success") { + PADDLE_ENFORCE_XPU_SUCCESS(value); + return true; +} + +template +bool CheckXPUStatusFailure(T value, const std::string& msg) { + try { + PADDLE_ENFORCE_XPU_SUCCESS(value); + return false; + } catch (paddle::platform::EnforceNotMet& error) { + std::string ex_msg = error.what(); + std::cout << ex_msg << std::endl; + return ex_msg.find(msg) != std::string::npos; + } +} + +TEST(enforce, xpu_status) { + EXPECT_TRUE(CheckXPUStatusSuccess(static_cast(XPU_SUCCESS))); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_INVALID_DEVICE), + "Invalid XPU device")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_UNINIT), + "XPU runtime not properly inited")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_NOMEM), + "Device memory not enough")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_NOCPUMEM), + "CPU memory not enough")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_INVALID_PARAM), + "Invalid parameter")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_NOXPUFUNC), + "Cannot get XPU Func")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_LDSO), + "Error loading dynamic library")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_LDSYM), + "Error loading func from dynamic library")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_SIMULATOR), + "Error from XPU Simulator")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_NOSUPPORT), + "Operation not supported")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_ABNORMAL), + "Device abnormal due to previous error")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_KEXCEPTION), + "Exception in kernel execution")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_TIMEOUT), + "Kernel execution timed out")); + EXPECT_TRUE( + CheckXPUStatusFailure(static_cast(XPUERR_BUSY), "Resource busy")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_USEAFCLOSE), + "Use a stream after closed")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_UCECC), + "Uncorrectable ECC")); + EXPECT_TRUE( + CheckXPUStatusFailure(static_cast(XPUERR_OVERHEAT), "Overheat")); + EXPECT_TRUE( + CheckXPUStatusFailure(static_cast(XPUERR_UNEXPECT), + "Execution error, reach unexpected control flow")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_DEVRESET), + "Device is being reset, try again later")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_HWEXCEPTION), + "Hardware module exception")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_HBM_INIT), + "Error init HBM")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_DEVINIT), + "Error init device")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_PEERRESET), + "Device is being reset, try again later")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_MAXDEV), + "Device count exceed limit")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_NOIOC), + "Unknown IOCTL command")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_DMATIMEOUT), + "DMA timed out, a reboot maybe needed")); + EXPECT_TRUE(CheckXPUStatusFailure( + static_cast(XPUERR_DMAABORT), + "DMA aborted due to error, possibly wrong address or hardware state")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_MCUUNINIT), + "Firmware not initialized")); + EXPECT_TRUE( + CheckXPUStatusFailure(static_cast(XPUERR_OLDFW), + "Firmware 
version too old (<15), please update.")); + EXPECT_TRUE( + CheckXPUStatusFailure(static_cast(XPUERR_PCIE), "Error in PCIE")); + EXPECT_TRUE( + CheckXPUStatusFailure(static_cast(XPUERR_FAULT), + "Error copy between kernel and user space")); + EXPECT_TRUE(CheckXPUStatusFailure(static_cast(XPUERR_INTERRUPTED), + "Execution interrupted by user")); +} + +TEST(enforce, bkcl_status) { + EXPECT_TRUE(CheckXPUStatusSuccess(BKCL_SUCCESS)); + EXPECT_TRUE( + CheckXPUStatusFailure(BKCL_INVALID_ARGUMENT, "BKCL_INVALID_ARGUMENT")); + EXPECT_TRUE(CheckXPUStatusFailure(BKCL_RUNTIME_ERROR, "BKCL_RUNTIME_ERROR")); + EXPECT_TRUE(CheckXPUStatusFailure(BKCL_SYSTEM_ERROR, "BKCL_SYSTEM_ERROR")); + EXPECT_TRUE( + CheckXPUStatusFailure(BKCL_INTERNAL_ERROR, "BKCL_INTERNAL_ERROR")); +} diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h index d4fd42d7a971e..26a1426bea036 100644 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h @@ -219,6 +219,7 @@ XPUOpMap& get_kl1_ops() { {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -262,6 +263,8 @@ XPUOpMap& get_kl1_ops() { {"softmax_with_cross_entropy_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"split", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 74f519c7a8617..79261a5d7bc88 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -29,9 +29,15 @@ using XPUOpMap = std::unordered_map; XPUOpMap& get_kl2_ops() { // KL1支持的op,通过op_name, data_type, place来索引 static XPUOpMap s_xpu2_kernels{ + {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"abs_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"argsort", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"assign_value", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm_grad", @@ -103,6 +109,7 @@ XPUOpMap& get_kl2_ops() { {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"expand_as_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), @@ -182,6 +189,12 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::INT64, 
XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_swish_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"huber_loss_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"huber_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"iou_similarity", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"label_smooth", @@ -194,6 +207,9 @@ XPUOpMap& get_kl2_ops() { {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"leaky_relu_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -221,6 +237,10 @@ XPUOpMap& get_kl2_ops() { {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"nearest_interp_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -233,12 +253,17 @@ XPUOpMap& get_kl2_ops() { {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, + {"reciprocal", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reciprocal_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"reduce_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -255,12 +280,20 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"roi_align_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, + {"scatter", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, + {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"slice_grad", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, @@ -280,6 +313,8 @@ XPUOpMap& get_kl2_ops() { {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"split", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"squeeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), @@ -318,6 +353,7 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"unsqueeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), @@ -339,6 +375,7 @@ XPUOpMap& get_kl2_ops() { {"where", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + // AddMore }; diff --git a/paddle/fluid/platform/device/xpu/xpu_header.h b/paddle/fluid/platform/device/xpu/xpu_header.h index fe75290c252df..1177fd63742b3 100644 --- a/paddle/fluid/platform/device/xpu/xpu_header.h +++ b/paddle/fluid/platform/device/xpu/xpu_header.h @@ -1,16 +1,16 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once @@ -21,37 +21,14 @@ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/float16.h" + #include "xpu/runtime.h" #include "xpu/runtime_ex.h" #include "xpu/xdnn.h" namespace xpu = baidu::xpu::api; -class XPUActHelper { - public: - // Convert string to activation type in xpu - static xpu::Activation_t ConvertToXpuActType( - const std::string& act_type_str) { - static std::unordered_map str2act = { - {"linear", xpu::Activation_t::LINEAR}, - {"relu", xpu::Activation_t::RELU}, - {"sigmoid", xpu::Activation_t::SIGMOID}, - {"tanh", xpu::Activation_t::TANH}, - {"gelu", xpu::Activation_t::GELU}, - {"leaky_relu", xpu::Activation_t::LEAKY_RELU}, - {"sqrt", xpu::Activation_t::SQRT}, - {"square", xpu::Activation_t::SQUARE}}; - - auto res = str2act.find(act_type_str); - PADDLE_ENFORCE_NE(res, str2act.end(), - paddle::platform::errors::InvalidArgument( - "Invalid activation type(%s) in XPU", act_type_str)); - return res->second; - } -}; - static std::map XPUAPIErrorMsg = { {xpu::Error_t::SUCCESS, "xpu api success"}, {xpu::Error_t::INVALID_PARAM, "xpu api invalid param"}, diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index adc8bcc22da98..483b1c5ce2795 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -14,8 +14,8 @@ limitations under the License. */ #include #include #include "gflags/gflags.h" +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/split.h" PADDLE_DEFINE_EXPORTED_string( @@ -31,7 +31,31 @@ PADDLE_DEFINE_EXPORTED_string( namespace paddle { namespace platform { -static int GetXPUDeviceCountImpl() { +/**************************** Version Management **************************/ + +//! Get the version of XPU Driver +int GetDriverVersion() { + uint32_t driver_version_major = 0; + uint32_t driver_version_minor = 0; + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_get_driver_version(&driver_version_major, &driver_version_minor)); + int driver_version = driver_version_major * 10 + driver_version_minor; + return driver_version; +} + +//! 
Get the version of XPU Runtime +int GetRuntimeVersion() { + uint32_t runtime_version_major = 0; + uint32_t runtime_version_minor = 0; + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_get_runtime_version(&runtime_version_major, &runtime_version_minor)); + int runtime_version = runtime_version_major * 10 + runtime_version_minor; + return runtime_version; +} + +/**************************** Device Management **************************/ + +static int GetDeviceCountImpl() { const auto *xpu_visible_devices = std::getenv("XPU_VISIBLE_DEVICES"); if (xpu_visible_devices != nullptr) { std::string xpu_visible_devices_str(xpu_visible_devices); @@ -44,29 +68,18 @@ static int GetXPUDeviceCountImpl() { } int count = 0; - int ret = xpu_device_count(&count); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_count(&count)); return count; } int GetXPUDeviceCount() { - static auto dev_cnt = GetXPUDeviceCountImpl(); + static auto dev_cnt = GetDeviceCountImpl(); return dev_cnt; } int GetXPUCurrentDeviceId() { int dev_id; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - + PADDLE_ENFORCE_XPU_SUCCESS(xpu_current_device(&dev_id)); if (dev_id >= 64) { // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id dev_id -= 64; @@ -74,6 +87,13 @@ int GetXPUCurrentDeviceId() { return dev_id; } +void SetXPUDeviceId(int id) { + PADDLE_ENFORCE_LT( + id, GetXPUDeviceCount(), + platform::errors::InvalidArgument("id must be less than XPU count")); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); +} + //! Get a list of device ids from environment variable or use all. std::vector GetXPUSelectedDevices() { // use user specified XPUs in single-node multi-process mode.
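Editor's note: the hunks above funnel every raw XPU runtime call through PADDLE_ENFORCE_XPU_SUCCESS, so callers of these helpers no longer check return codes themselves. A minimal usage sketch under that assumption follows; BindLastVisibleXPU is a hypothetical wrapper, the other names are the helpers declared in xpu_info.h.

#include "paddle/fluid/platform/device/xpu/xpu_info.h"

// Hypothetical example: bind the calling thread to the last visible XPU, if any.
void BindLastVisibleXPU() {
  using namespace paddle::platform;
  int count = GetXPUDeviceCount();    // counted once, cached in a function-local static
  if (count == 0) return;             // no Kunlun card visible
  SetXPUDeviceId(count - 1);          // enforces id < count, then calls xpu_set_device
  int cur = GetXPUCurrentDeviceId();  // simulator ids (>= 64) are already normalized
  (void)cur;
}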
@@ -92,24 +112,38 @@ std::vector GetXPUSelectedDevices() { return devices; } -void SetXPUDeviceId(int id) { - PADDLE_ENFORCE_LT( - id, GetXPUDeviceCount(), - platform::errors::InvalidArgument("id must less than XPU count")); - int ret = xpu_set_device(id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); +/**************************** Memory Management **************************/ + +void MemcpySyncH2D(void *dst, const void *src, size_t count, int dev_id) { + platform::XPUDeviceGuard guard(dev_id); + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); +} + +void MemcpySyncD2H(void *dst, const void *src, size_t count, int dev_id) { + platform::XPUDeviceGuard guard(dev_id); + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST)); +} + +void MemcpySyncD2D(void *dst, int dst_id, const void *src, int src_id, + size_t count) { + int dev_id = GetXPUCurrentDeviceId(); + if (dst_id == dev_id && src_id == dev_id) { + platform::XPUDeviceGuard guard(dev_id); + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE)); + } else { + PADDLE_ENFORCE_XPU_SUCCESS( + xpu_memcpy_peer(dst_id, dst, src_id, src, count)); + } } +/**************************** Others **************************/ + XPUVersion get_xpu_version(int dev_id) { uint64_t v = 0; - int ret = xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "xpu_device_get_attr return wrong value[%d]", ret)); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id)); if (v == K100 || v == K200) { VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n"; diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 3cb79d51eb7bb..82672e61e51f4 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -16,17 +16,35 @@ limitations under the License. */ namespace paddle { namespace platform { +/***** Version Management *****/ + +//! Get the version of XPU Driver +int GetDriverVersion(); + +//! Get the version of XPU Runtime +int GetRuntimeVersion(); + +/***** Device Management *****/ + //! Get the total number of XPU devices in system. int GetXPUDeviceCount(); +//! Set the XPU device id for next execution. +void SetXPUDeviceId(int device_id); + //! Get the current XPU device id in system. int GetXPUCurrentDeviceId(); //! Get a list of device ids from environment variable or use all. std::vector GetXPUSelectedDevices(); -//! Set the XPU device id for next execution. -void SetXPUDeviceId(int device_id); +/***** Memory Management *****/ + +//! Copy memory from address src to dst synchronously. 
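Editor's note: a hedged sketch of how two of the three synchronous copy helpers declared just below (and implemented in xpu_info.cc above) are typically combined; RoundTripXPUCopy and its buffer arguments are hypothetical, and xpu_info.h is assumed to be included.

// Copy n floats host -> device -> host on device dev_id. Each helper pins
// dev_id with an XPUDeviceGuard before issuing the blocking xpu_memcpy.
void RoundTripXPUCopy(void *dev_buf, const float *host_src, float *host_dst,
                      size_t n, int dev_id) {
  paddle::platform::MemcpySyncH2D(dev_buf, host_src, n * sizeof(float), dev_id);
  paddle::platform::MemcpySyncD2H(host_dst, dev_buf, n * sizeof(float), dev_id);
}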
+void MemcpySyncH2D(void *dst, const void *src, size_t count, int dev_id); +void MemcpySyncD2H(void *dst, const void *src, size_t count, int dev_id); +void MemcpySyncD2D(void *dst, int dst_id, const void *src, int src_id, + size_t count); class XPUDeviceGuard { public: @@ -44,8 +62,8 @@ class XPUDeviceGuard { } } - XPUDeviceGuard(const XPUDeviceGuard& o) = delete; - XPUDeviceGuard& operator=(const XPUDeviceGuard& o) = delete; + XPUDeviceGuard(const XPUDeviceGuard &o) = delete; + XPUDeviceGuard &operator=(const XPUDeviceGuard &o) = delete; private: int prev_id_{-1}; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 415444da9eb77..a8bec4b1fe20a 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -260,19 +260,7 @@ XPUDeviceContext::XPUDeviceContext() { XPUDeviceContext::~XPUDeviceContext() {} XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) { - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - ret = xpu_set_device(place.device); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + platform::XPUDeviceGuard guard(place.device); LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: " << place_.device; @@ -299,22 +287,10 @@ XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) { break; } } - - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); } void XPUDeviceContext::Wait() const { - int ret = xpu_set_device(place_.device); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + platform::SetXPUDeviceId(place_.device); xpu_wait(context_->xpu_stream); } @@ -480,15 +456,37 @@ CUDAContext::CUDAContext(const CUDAPlace& place, InitCuBlasContext(); InitCuDNNContext(); #ifndef PADDLE_WITH_HIP + InitCuSparseContext(); InitCuSolverContext(); #endif } +void CUDAContext::SetStream(gpuStream_t stream) { + if (stream_->raw_stream() != stream) { + CUDADeviceGuard guard(place_.device); + DestoryCuDNNContext(); + DestoryCuBlasContext(); +#ifndef PADDLE_WITH_HIP + DestoryCuSolverContext(); +#endif + + stream_->SetStream(stream); + + InitEigenContext(); + InitCuBlasContext(); + InitCuDNNContext(); +#ifndef PADDLE_WITH_HIP + InitCuSolverContext(); +#endif + } +} + CUDAContext::~CUDAContext() { CUDADeviceGuard guard(place_.device); DestoryCuDNNContext(); DestoryCuBlasContext(); #ifndef PADDLE_WITH_HIP + DestoryCuSparseContext(); DestoryCuSolverContext(); #endif } @@ -606,6 +604,9 @@ rocblas_handle CUDADeviceContext::cublas_handle() const { cublasHandle_t CUDADeviceContext::cublas_handle() const { return context()->CublasHandle()->GetCublasHandle(); } +cusparseHandle_t CUDADeviceContext::cusparse_handle() const { + return context()->CusparseHandle()->GetCusparseHandle(); +} #endif CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { @@ -845,6 +846,15 @@ unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const { return num_entries; } +// TODO(jczaja): Replace 
with C++20 equivalents when applicable +#ifdef _WIN32 +#define likely(expr) (expr) +#define unlikely(expr) (expr) +#else +#define likely(expr) (__builtin_expect(!!(expr), 1)) +#define unlikely(expr) (__builtin_expect(!!(expr), 0)) +#endif + MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( const std::string& name) const { BlobMap* pMap = p_blobmap_.get(); @@ -857,7 +867,10 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( // Find ShapeBlob for current mkldnn session id firstly auto map_it = pMap->find(sid); - if (map_it == pMap->end()) { + // (jczaja): After first iteration of model's execution we + // should have all elements cached (mostly) so failures are unlikely (less + // likely for dynamic shapes) + if (unlikely(map_it == pMap->end())) { VLOG(2) << "GetBlob: sid=" << sid << ", miss sid\n"; return nullptr; } @@ -865,7 +878,7 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( // Find KeyBlob for current input shape secondly auto sBlob_it = sBlob->find(tls().cur_input_shape_str); - if (sBlob_it == sBlob->end()) { + if (unlikely(sBlob_it == sBlob->end())) { VLOG(2) << "GetBlob: sid=" << tls().cur_input_shape_str << ", miss input_shape_str\n"; return nullptr; @@ -875,7 +888,7 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( // Find Blob via name auto key_it = pBlob->find(name); - if (key_it == pBlob->end()) { + if (unlikely(key_it == pBlob->end())) { VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n"; return nullptr; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index f790743099cfe..853cf984fd018 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -332,6 +332,8 @@ class CUDAContext { return old_stream_ptr; } + void SetStream(gpuStream_t stream); + const gpuStream_t& RawStream() { return stream_->raw_stream(); } #ifdef PADDLE_WITH_HIP @@ -354,6 +356,12 @@ class CUDAContext { return cublas_tensor_core_handle_; } +#ifndef PADDLE_WITH_HIP + const std::unique_ptr& CusparseHandle() const { + return cusparse_handle_; + } +#endif + /*! \brief Call cublas function safely. */ template inline void CublasCall(Callback&& callback) const { @@ -364,6 +372,14 @@ class CUDAContext { } } +#ifndef PADDLE_WITH_HIP + /*! \brief Call cusparse function safely. */ + template + inline void CusparseCall(Callback&& callback) const { + cusparse_handle_->Call(std::forward(callback)); + } +#endif + /*! \brief Check whether tensor core is supported */ bool tensor_core_available() const; @@ -402,6 +418,12 @@ class CUDAContext { } #endif +#ifndef PADDLE_WITH_HIP + void InitCuSparseContext() { + cusparse_handle_.reset(new CusparseHandleHolder(RawStream())); + } +#endif + void InitCuDNNContext() { if (dynload::HasCUDNN()) { #ifdef PADDLE_WITH_HIP @@ -474,6 +496,10 @@ class CUDAContext { cublas_tf32_tensor_core_handle_.reset(); } +#ifndef PADDLE_WITH_HIP + void DestoryCuSparseContext() { cusparse_handle_.reset(); } +#endif + #ifndef PADDLE_WITH_HIP void DestoryCuSolverContext() { if (cusolver_dn_handle_) { @@ -497,6 +523,7 @@ class CUDAContext { std::unique_ptr cublas_tf32_tensor_core_handle_; #ifndef PADDLE_WITH_HIP cusolverDnHandle_t cusolver_dn_handle_; + std::unique_ptr cusparse_handle_; #endif DISABLE_COPY_AND_ASSIGN(CUDAContext); }; @@ -536,6 +563,14 @@ class CUDADeviceContext : public DeviceContext { return context()->CublasCall(callback); } +#ifndef PADDLE_WITH_HIP + /*! \brief Call cusparse function safely. 
*/ + template + inline void CusparseCall(Callback&& callback) const { + return context()->CusparseCall(callback); + } +#endif + /*! \brief Check whether tensor core is supported */ bool tensor_core_available() const; @@ -558,6 +593,7 @@ class CUDADeviceContext : public DeviceContext { rocblas_handle cublas_handle() const; #else cublasHandle_t cublas_handle() const; + cusparseHandle_t cusparse_handle() const; #endif /*! \brief Return a cudnn workspace handle to call multiple cudnn @@ -614,6 +650,11 @@ class CUDADeviceContext : public DeviceContext { return thread_ctx_.at(this); } + // Note: Can only be used under thread_local semantics. + void SetThreadLocalStream(const gpuStream_t stream) { + thread_ctx_.at(this)->SetStream(stream); + } + private: CUDAPlace place_; std::shared_ptr default_ctx_; diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 28c51251627c5..ff11bfd62c138 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -348,7 +348,7 @@ class DeviceTracerImpl : public DeviceTracer { } void AddCPURecords(const std::string &anno, uint64_t start_ns, - uint64_t end_ns, int64_t device_id, int64_t thread_id) { + uint64_t end_ns, int64_t device_id, uint64_t thread_id) { if (anno.empty()) { VLOG(1) << "Empty timeline annotation."; return; @@ -383,7 +383,7 @@ class DeviceTracerImpl : public DeviceTracer { void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place &place, const std::string &alloc_in, - const std::string &free_in, int64_t thread_id) { + const std::string &free_in, uint64_t thread_id) { if (0 == start_ns || 0 == end_ns) { VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced."; return; diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 9d6e435c8457f..4cb01529506b7 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -50,7 +50,7 @@ class DeviceTracer { uint64_t start_ns; uint64_t end_ns; int64_t device_id; - int64_t thread_id; + uint64_t thread_id; }; struct MemRecord { @@ -68,7 +68,7 @@ class DeviceTracer { uint64_t end_ns; size_t bytes; Place place; - int64_t thread_id; + uint64_t thread_id; std::string alloc_in; std::string free_in; }; @@ -105,7 +105,7 @@ class DeviceTracer { virtual void AddCPURecords(const std::string& anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, - int64_t thread_id) = 0; + uint64_t thread_id) = 0; virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns, uint64_t end_ns, int64_t device_id, uint64_t thread_id, @@ -115,7 +115,7 @@ class DeviceTracer { size_t bytes, const Place& place, const std::string& alloc_in, const std::string& free_in, - int64_t thread_id) = 0; + uint64_t thread_id) = 0; // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. 
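Editor's note: the device_context changes above add two related capabilities: a context-owned cuSPARSE handle (exposed via CusparseCall/cusparse_handle) and SetStream/SetThreadLocalStream, which rebind a CUDAContext to an externally created stream by destroying and re-creating the cuBLAS/cuDNN/cuSOLVER handles against it. A minimal sketch of the latter, assuming dev_ctx and external_stream are owned by the caller and device_context.h is included:

#include "paddle/fluid/platform/device_context.h"

// Rebind the calling thread's CUDAContext to an external stream. Per the note
// in device_context.h this is thread-local, and because the library handles are
// torn down and re-created, no work should be in flight on the old stream.
void UseExternalStream(paddle::platform::CUDADeviceContext *dev_ctx,
                       gpuStream_t external_stream) {
  dev_ctx->Wait();                                 // drain pending work first
  dev_ctx->SetThreadLocalStream(external_stream);  // no-op if the stream is unchanged
}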
diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index 4c018908b5945..f9dc6baea3c29 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -49,6 +49,10 @@ extern void *cusolver_dso_handle; __macro(cusolverDnDpotrf_bufferSize); \ __macro(cusolverDnSpotrf); \ __macro(cusolverDnDpotrf); \ + __macro(cusolverDnSpotrs); \ + __macro(cusolverDnDpotrs); \ + __macro(cusolverDnCpotrs); \ + __macro(cusolverDnZpotrs); \ __macro(cusolverDnSsyevd_bufferSize); \ __macro(cusolverDnDsyevd_bufferSize); \ __macro(cusolverDnCheevd_bufferSize); \ @@ -64,7 +68,13 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); #define CUSOLVER_ROUTINE_EACH_R1(__macro) \ __macro(cusolverDnSpotrfBatched); \ __macro(cusolverDnDpotrfBatched); \ + __macro(cusolverDnSpotrsBatched); \ + __macro(cusolverDnDpotrsBatched); \ __macro(cusolverDnSgesvdj_bufferSize); \ + __macro(cusolverDnSgetrf_bufferSize); \ + __macro(cusolverDnDgetrf_bufferSize); \ + __macro(cusolverDnCgetrf_bufferSize); \ + __macro(cusolverDnZgetrf_bufferSize); \ __macro(cusolverDnSgeqrf_bufferSize); \ __macro(cusolverDnDgeqrf_bufferSize); \ __macro(cusolverDnCgeqrf_bufferSize); \ @@ -78,6 +88,10 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); __macro(cusolverDnDgesvdj_bufferSize); \ __macro(cusolverDnSgesvdj); \ __macro(cusolverDnDgesvdj); \ + __macro(cusolverDnSgetrf); \ + __macro(cusolverDnDgetrf); \ + __macro(cusolverDnCgetrf); \ + __macro(cusolverDnZgetrf); \ __macro(cusolverDnSgeqrf); \ __macro(cusolverDnDgeqrf); \ __macro(cusolverDnCgeqrf); \ diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc index 2a1fe322dabcf..be67f121d68ed 100644 --- a/paddle/fluid/platform/dynload/cusparse.cc +++ b/paddle/fluid/platform/dynload/cusparse.cc @@ -30,6 +30,10 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); #ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); #endif + +#ifdef CUSPARSE_ROUTINE_EACH_11020 +CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP); +#endif } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index e44e8ed08560f..fc842a3377b63 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -41,21 +41,41 @@ extern void *cusparse_dso_handle; }; \ extern DynLoad__##__name __name -#if !defined(PADDLE_WITH_ARM) && !defined(_WIN32) -// APIs available after CUDA 11.0 -#if CUDA_VERSION >= 11000 +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 10010 #define CUSPARSE_ROUTINE_EACH(__macro) \ __macro(cusparseCreate); \ - __macro(cusparseCreateCsr); \ - __macro(cusparseCreateDnMat); \ - __macro(cusparseSpMM_bufferSize); \ - __macro(cusparseSpMM); \ - __macro(cusparseDestroySpMat); \ - __macro(cusparseDestroyDnMat); \ - __macro(cusparseDestroy); + __macro(cusparseSetStream); \ + __macro(cusparseCreateMatDescr); \ + __macro(cusparseDestroy); \ + __macro(cusparseSnnz); \ + __macro(cusparseDnnz); \ + __macro(cusparseSetMatType); \ + __macro(cusparseSetMatIndexBase); CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); +// APIs available after CUDA 11.2 +#if CUDA_VERSION >= 11020 +#define CUSPARSE_ROUTINE_EACH_11020(__macro) \ + __macro(cusparseCreateCsr); \ + __macro(cusparseCreateCoo); \ + __macro(cusparseCreateDnMat); \ + __macro(cusparseSpMM_bufferSize); \ + __macro(cusparseSpMM); \ + 
__macro(cusparseDestroySpMat); \ + __macro(cusparseDestroyDnMat); \ + __macro(cusparseCooSetPointers); \ + __macro(cusparseCsrSetPointers); \ + __macro(cusparseDenseToSparse_bufferSize); \ + __macro(cusparseDenseToSparse_analysis); \ + __macro(cusparseDenseToSparse_convert); \ + __macro(cusparseSparseToDense_bufferSize); \ + __macro(cusparseSparseToDense); + +CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) + // APIs available after CUDA 11.3 #if CUDA_VERSION >= 11030 #define CUSPARSE_ROUTINE_EACH_R2(__macro) \ @@ -67,6 +87,7 @@ CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) #endif #endif #endif +#endif #undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP } // namespace dynload diff --git a/paddle/fluid/platform/dynload/lapack.h b/paddle/fluid/platform/dynload/lapack.h index 9b4dd3d9e3ce5..ce24b98defbe9 100644 --- a/paddle/fluid/platform/dynload/lapack.h +++ b/paddle/fluid/platform/dynload/lapack.h @@ -66,6 +66,48 @@ extern "C" void cgeev_(char *jobvl, char *jobvr, int *n, std::complex *a, std::complex *work, int *lwork, float *rwork, int *info); +// gels +extern "C" void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, + int *lda, double *b, int *ldb, double *work, int *lwork, + int *info); +extern "C" void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, + int *lda, float *b, int *ldb, float *work, int *lwork, + int *info); + +// gelsd +extern "C" void dgelsd_(int *m, int *n, int *nrhs, double *a, int *lda, + double *b, int *ldb, double *s, double *rcond, + int *rank, double *work, int *lwork, int *iwork, + int *info); +extern "C" void sgelsd_(int *m, int *n, int *nrhs, float *a, int *lda, float *b, + int *ldb, float *s, float *rcond, int *rank, + float *work, int *lwork, int *iwork, int *info); + +// gelsy +extern "C" void dgelsy_(int *m, int *n, int *nrhs, double *a, int *lda, + double *b, int *ldb, int *jpvt, double *rcond, + int *rank, double *work, int *lwork, int *info); +extern "C" void sgelsy_(int *m, int *n, int *nrhs, float *a, int *lda, float *b, + int *ldb, int *jpvt, float *rcond, int *rank, + float *work, int *lwork, int *info); + +// gelss +extern "C" void dgelss_(int *m, int *n, int *nrhs, double *a, int *lda, + double *b, int *ldb, double *s, double *rcond, + int *rank, double *work, int *lwork, int *info); +extern "C" void sgelss_(int *m, int *n, int *nrhs, float *a, int *lda, float *b, + int *ldb, float *s, float *rcond, int *rank, + float *work, int *lwork, int *info); + +extern "C" void zpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, + int *lda, std::complex *b, int *ldb, int *info); +extern "C" void cpotrs_(char *uplo, int *n, int *nrhs, std::complex *a, + int *lda, std::complex *b, int *ldb, int *info); +extern "C" void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, + double *b, int *ldb, int *info); +extern "C" void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, + float *b, int *ldb, int *info); + namespace paddle { namespace platform { namespace dynload { @@ -105,7 +147,19 @@ extern void *lapack_dso_handle; __macro(dgeev_); \ __macro(sgeev_); \ __macro(zgeev_); \ - __macro(cgeev_); + __macro(cgeev_); \ + __macro(dgels_); \ + __macro(sgels_); \ + __macro(dgelsd_); \ + __macro(sgelsd_); \ + __macro(dgelsy_); \ + __macro(sgelsy_); \ + __macro(dgelss_); \ + __macro(sgelss_); \ + __macro(zpotrs_); \ + __macro(cpotrs_); \ + __macro(dpotrs_); \ + __macro(spotrs_); LAPACK_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_LAPACK_WRAP); diff --git a/paddle/fluid/platform/dynload/nccl.cc 
b/paddle/fluid/platform/dynload/nccl.cc index 24a4e5aad04f6..8f917e4904ffe 100644 --- a/paddle/fluid/platform/dynload/nccl.cc +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -37,6 +37,10 @@ NCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) #endif +#if NCCL_VERSION_CODE >= 21100 +NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index ea6daf15b9197..f0679b2bce11e 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -76,6 +76,13 @@ NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) NCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif +#if NCCL_VERSION_CODE >= 21100 +#define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ + __macro(ncclRedOpCreatePreMulSum); \ + __macro(ncclRedOpDestroy); +NCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 530ae6ba79889..30930897ea8ca 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -33,6 +33,7 @@ limitations under the License. */ #include #include #include +#include #include #include #include "paddle/fluid/platform/external_error.pb.h" @@ -707,6 +708,7 @@ DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess, CUDA); DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); +DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS, CUSPARSE); DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT); DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS, CU); @@ -750,6 +752,10 @@ inline const char* GetErrorMsgUrl(T status) { break; case platform::proto::ApiType::CUFFT: return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult"; + case platform::proto::ApiType::CUSPARSE: + return "https://docs.nvidia.com/cuda/cusparse/" + "index.html#cusparseStatus_t"; + break; default: return "Unknown type of External API, can't get error message URL!"; break; @@ -837,6 +843,7 @@ template std::string GetExternalErrorMsg(cudaError_t); template std::string GetExternalErrorMsg(curandStatus_t); template std::string GetExternalErrorMsg(cudnnStatus_t); template std::string GetExternalErrorMsg(cublasStatus_t); +template std::string GetExternalErrorMsg(cusparseStatus_t); template std::string GetExternalErrorMsg(cusolverStatus_t); template std::string GetExternalErrorMsg(cufftResult_t); template std::string GetExternalErrorMsg(CUresult); @@ -889,6 +896,17 @@ inline std::string build_nvidia_error_msg(cublasStatus_t stat) { return sout.str(); } +/*************** CUSPARSE ERROR ***************/ +inline bool is_error(cusparseStatus_t stat) { + return stat != CUSPARSE_STATUS_SUCCESS; +} + +inline std::string build_nvidia_error_msg(cusparseStatus_t stat) { + std::ostringstream sout; + sout << "CUSparse error(" << stat << "). 
" << GetExternalErrorMsg(stat); + return sout.str(); +} + /*************** CUSOLVER ERROR ***************/ inline bool is_error(cusolverStatus_t stat) { return stat != CUSOLVER_STATUS_SUCCESS; diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 0d1eee316846c..919266575e6ce 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -45,9 +46,9 @@ class Event { Event(EventType type, std::string name, uint32_t thread_id, EventRole role = EventRole::kOrdinary, std::string attr = "none"); - const EventType& type() const; - Event* parent() const { return parent_; } - void set_parent(Event* parent) { parent_ = parent; } + const EventType &type() const; + Event *parent() const { return parent_; } + void set_parent(Event *parent) { parent_ = parent; } std::string name() const { return name_; } EventRole role() const { return role_; } uint64_t thread_id() const { return thread_id_; } @@ -61,13 +62,13 @@ class Event { #endif #endif - double CpuElapsedMs(const Event& e) const; - double CudaElapsedMs(const Event& e) const; + double CpuElapsedMs(const Event &e) const; + double CudaElapsedMs(const Event &e) const; private: EventType type_; std::string name_{}; - Event* parent_{nullptr}; + Event *parent_{nullptr}; uint64_t thread_id_; EventRole role_{}; int64_t cpu_ns_; @@ -90,13 +91,13 @@ class Event { #endif }; -using EventWithStartNs = std::pair; +using EventWithStartNs = std::pair; using ThreadEvents = std::map; class MemEvent { public: MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes, - Place place, int64_t thread_id, const std::string& annotation) + Place place, int64_t thread_id, const std::string &annotation) : type_(type), start_ns_(start_ns), end_ns_(end_ns), @@ -105,13 +106,13 @@ class MemEvent { thread_id_(thread_id), annotation_(annotation) {} - const EventType& type() const { return type_; } + const EventType &type() const { return type_; } uint64_t start_ns() const { return start_ns_; } uint64_t end_ns() const { return end_ns_; } size_t bytes() const { return bytes_; } Place place() const { return place_; } uint64_t thread_id() const { return thread_id_; } - const std::string& annotation() const { return annotation_; } + const std::string &annotation() const { return annotation_; } private: EventType type_; @@ -151,7 +152,7 @@ class CudaEvent { #endif } - void Record(const paddle::platform::stream::CUDAStream& stream) { + void Record(const paddle::platform::stream::CUDAStream &stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); #else @@ -200,5 +201,39 @@ class CudaEvent { #endif }; +struct CommonEvent { + public: + CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} + + CommonEvent(std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role, const std::string &attr_str) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + buf = static_cast(arena_allocator(attr_str.length() + 1)); + strncpy(buf, attr_str.c_str(), attr_str.length() + 1); + attr = buf; + } + + CommonEvent(const std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + 
EventRole role) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + } + + const char *name = nullptr; // not owned, designed for performance + uint64_t start_ns = 0; + uint64_t end_ns = 0; + EventRole role = EventRole::kOrdinary; + const char *attr = nullptr; // not owned, designed for performance +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/event_tracing.h b/paddle/fluid/platform/event_tracing.h new file mode 100644 index 0000000000000..f68b4b5162a9f --- /dev/null +++ b/paddle/fluid/platform/event_tracing.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/event.h" + +namespace paddle { +namespace platform { + +// CPU event tracing. A trace marks something that happens but has no duration +// associated with it. For example, thread starts working. +// Chrome Trace Viewer Format: Instant Event +struct RecordInstantEvent { + explicit RecordInstantEvent(const char* name, + const EventRole role = EventRole::kOrdinary); +}; + +// CPU event tracing. A trace starts when an object of this class is created and +// stops when the object is destroyed. +// Chrome Trace Viewer Format: Duration Event/Complete Event +class RecordEvent { + public: + explicit RecordEvent(const std::string& name, + const EventRole role = EventRole::kOrdinary); + + explicit RecordEvent(const char* name, + const EventRole role = EventRole::kOrdinary); + + RecordEvent(const std::string& name, const EventRole role, + const std::string& attr); + + // Stop event tracing explicitly before the object goes out of scope. + // Sometimes it's inconvenient to use RAII + void End(); + + ~RecordEvent() { End(); } + + private: + void OriginalConstruct(const std::string& name, const EventRole role, + const std::string& attr); + + bool is_enabled_{false}; + bool is_pushed_{false}; + // Event name + std::string* name_{nullptr}; + const char* shallow_copy_name_{nullptr}; + uint64_t start_ns_; + // Need to distinguish name by op type, block_id, program_id and perhaps + // different kernel invocations within an op.
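  // Editor's note (illustrative, not part of the patch): a minimal usage sketch
  // of this RAII tracer; RunKernel() is a hypothetical traced region and tracing
  // must have been enabled for the event to be emitted.
  //
  //   {
  //     paddle::platform::RecordEvent record("matmul::forward");
  //     RunKernel();      // appears as a Chrome Trace duration event
  //     record.End();     // optional explicit stop when RAII scoping is inconvenient
  //   }
  //   paddle::platform::RecordInstantEvent instant("worker_thread_start");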
+ // std::string full_name_; + EventRole role_{EventRole::kOrdinary}; + std::string* attr_{nullptr}; + bool finished_{false}; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/external_error.proto b/paddle/fluid/platform/external_error.proto index fcbbb4162612d..8861c2c2ff4fb 100644 --- a/paddle/fluid/platform/external_error.proto +++ b/paddle/fluid/platform/external_error.proto @@ -26,6 +26,7 @@ enum ApiType { NCCL = 5; CUFFT = 6; CU = 7; + CUSPARSE = 8; } message MessageDesc { @@ -45,4 +46,4 @@ message AllMessageDesc { message ExternalErrorDesc { // Error messages of different kind of external third party API repeated AllMessageDesc errors = 1; -} \ No newline at end of file +} diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 2df3d00dc924a..8b117a5a8292d 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -694,6 +694,18 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(run_pten_kernel, true, "It controls whether to use pten kernel"); +/** + * Pt kernel related FLAG + * Name: FLAGS_run_kp_kernel + * Since Version: 2.3.0 + * Value Range: bool, default=false + * Example: FLAGS_run_kp_kernel=true would use the kp kernel to compute in + * the Op for XPU2. + * Note: + */ +PADDLE_DEFINE_EXPORTED_bool(run_kp_kernel, true, + "It controls whether to use kp kernel for xpu2"); + /** * Distributed related FLAG * Name: FLAGS_allreduce_record_one_event diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index bdd4d54b3d1a1..b6d088421af27 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -214,7 +214,7 @@ struct PADDLE_ALIGN(2) float16 { // Conversion opertors #ifdef PADDLE_CUDA_FP16 - HOSTDEVICE inline explicit operator half() const { + HOSTDEVICE inline half to_half() const { #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 __half_raw h; h.x = x; @@ -233,7 +233,7 @@ struct PADDLE_ALIGN(2) float16 { } #endif - HOSTDEVICE inline explicit operator float() const { + HOSTDEVICE inline operator float() const { #if defined(PADDLE_CUDA_FP16) && \ (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = *reinterpret_cast(this); @@ -302,7 +302,7 @@ struct PADDLE_ALIGN(2) float16 { return static_cast(static_cast(*this)); } - HOSTDEVICE inline explicit operator double() const { + HOSTDEVICE inline operator double() const { return static_cast(static_cast(*this)); } @@ -350,7 +350,7 @@ DEVICE inline half operator+(const half& a, const half& b) { return __hadd(a, b); #else float res = static_cast(float16(a)) + static_cast(float16(b)); - return half(float16(res)); + return float16(res).to_half(); #endif } @@ -359,7 +359,7 @@ DEVICE inline half operator-(const half& a, const half& b) { return __hsub(a, b); #else float res = static_cast(float16(a)) - static_cast(float16(b)); - return half(float16(res)); + return float16(res).to_half(); #endif } @@ -368,7 +368,7 @@ DEVICE inline half operator*(const half& a, const half& b) { return __hmul(a, b); #else float res = static_cast(float16(a)) * static_cast(float16(b)); - return half(float16(res)); + return float16(res).to_half(); #endif } @@ -379,7 +379,7 @@ DEVICE inline half operator/(const half& a, const half& b) { return __float2half(num / denom); #else float res = static_cast(float16(a)) / static_cast(float16(b)); - return half(float16(res)); + return float16(res).to_half(); #endif } @@ -388,7 +388,7 @@ DEVICE inline half operator-(const half& a) { return __hneg(a); 
#else float res = -static_cast(float16(a)); - return half(float16(res)); + return float16(res).to_half(); #endif } @@ -470,7 +470,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) { // in __host__ __device__ function #if defined(__HIPCC__) DEVICE inline float16 operator+(const float16& a, const float16& b) { - return float16(__hadd(half(a), half(b))); + return float16(__hadd(a.to_half(), b.to_half())); } HOST inline float16 operator+(const float16& a, const float16& b) { return float16(static_cast(a) + static_cast(b)); @@ -478,7 +478,7 @@ HOST inline float16 operator+(const float16& a, const float16& b) { #else HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hadd(half(a), half(b))); + return float16(__hadd(a.to_half(), b.to_half())); #else return float16(static_cast(a) + static_cast(b)); #endif @@ -487,7 +487,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { #if defined(__HIPCC__) DEVICE inline float16 operator-(const float16& a, const float16& b) { - return float16(__hsub(half(a), half(b))); + return float16(__hsub(a.to_half(), b.to_half())); } HOST inline float16 operator-(const float16& a, const float16& b) { return float16(static_cast(a) - static_cast(b)); @@ -495,7 +495,7 @@ HOST inline float16 operator-(const float16& a, const float16& b) { #else HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hsub(half(a), half(b))); + return float16(__hsub(a.to_half(), b.to_half())); #else return float16(static_cast(a) - static_cast(b)); #endif @@ -504,7 +504,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { #if defined(__HIPCC__) DEVICE inline float16 operator*(const float16& a, const float16& b) { - return float16(__hmul(half(a), half(b))); + return float16(__hmul(a.to_half(), b.to_half())); } HOST inline float16 operator*(const float16& a, const float16& b) { return float16(static_cast(a) * static_cast(b)); @@ -512,7 +512,7 @@ HOST inline float16 operator*(const float16& a, const float16& b) { #else HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hmul(half(a), half(b))); + return float16(__hmul(a.to_half(), b.to_half())); #else return float16(static_cast(a) * static_cast(b)); #endif @@ -521,7 +521,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { #if defined(__HIPCC__) DEVICE inline float16 operator/(const float16& a, const float16& b) { - return float16(__hdiv(half(a), half(b))); + return float16(__hdiv(a.to_half(), b.to_half())); } HOST inline float16 operator/(const float16& a, const float16& b) { return float16(static_cast(a) / static_cast(b)); @@ -530,8 +530,8 @@ HOST inline float16 operator/(const float16& a, const float16& b) { HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 // TODO(kexinzhao): check which cuda version starts to support __hdiv - float num = __half2float(half(a)); - float denom = __half2float(half(b)); + float num = __half2float(a.to_half()); + float denom = __half2float(b.to_half()); return float16(num / denom); #else return float16(static_cast(a) / static_cast(b)); @@ -541,7 +541,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { #if defined(__HIPCC__) DEVICE inline float16 
operator-(const float16& a) { - return float16(__hneg(half(a))); + return float16(__hneg(a.to_half())); } HOST inline float16 operator-(const float16& a) { float16 res; @@ -551,7 +551,7 @@ HOST inline float16 operator-(const float16& a) { #else HOSTDEVICE inline float16 operator-(const float16& a) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hneg(half(a))); + return float16(__hneg(a.to_half())); #else float16 res; res.x = a.x ^ 0x8000; @@ -584,7 +584,7 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT // in __host__ __device__ function #if defined(__HIPCC__) DEVICE inline bool operator==(const float16& a, const float16& b) { - return __heq(half(a), half(b)); + return __heq(a.to_half(), b.to_half()); } HOST inline bool operator==(const float16& a, const float16& b) { return static_cast(a) == static_cast(b); @@ -592,7 +592,7 @@ HOST inline bool operator==(const float16& a, const float16& b) { #else // __HIPCC__ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __heq(half(a), half(b)); + return __heq(a.to_half(), b.to_half()); #else return static_cast(a) == static_cast(b); #endif @@ -601,7 +601,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { #if defined(__HIPCC__) DEVICE inline bool operator!=(const float16& a, const float16& b) { - return __hne(half(a), half(b)); + return __hne(a.to_half(), b.to_half()); } HOST inline bool operator!=(const float16& a, const float16& b) { return static_cast(a) != static_cast(b); @@ -609,7 +609,7 @@ HOST inline bool operator!=(const float16& a, const float16& b) { #else // __HIPCC__ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hne(half(a), half(b)); + return __hne(a.to_half(), b.to_half()); #else return static_cast(a) != static_cast(b); #endif @@ -618,7 +618,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { #if defined(__HIPCC__) DEVICE inline bool operator<(const float16& a, const float16& b) { - return __hlt(half(a), half(b)); + return __hlt(a.to_half(), b.to_half()); } HOST inline bool operator<(const float16& a, const float16& b) { return static_cast(a) < static_cast(b); @@ -626,7 +626,7 @@ HOST inline bool operator<(const float16& a, const float16& b) { #else // __HIPCC__ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(half(a), half(b)); + return __hlt(a.to_half(), b.to_half()); #else return static_cast(a) < static_cast(b); #endif @@ -635,7 +635,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { #if defined(__HIPCC__) DEVICE inline bool operator<=(const float16& a, const float16& b) { - return __hle(half(a), half(b)); + return __hle(a.to_half(), b.to_half()); } HOST inline bool operator<=(const float16& a, const float16& b) { return static_cast(a) <= static_cast(b); @@ -643,7 +643,7 @@ HOST inline bool operator<=(const float16& a, const float16& b) { #else // __HIPCC__ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hle(half(a), half(b)); + return __hle(a.to_half(), b.to_half()); #else return static_cast(a) <= static_cast(b); #endif @@ -652,7 +652,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { #if defined(__HIPCC__) DEVICE inline bool 
operator>(const float16& a, const float16& b) { - return __hgt(half(a), half(b)); + return __hgt(a.to_half(), b.to_half()); } HOST inline bool operator>(const float16& a, const float16& b) { return static_cast(a) > static_cast(b); @@ -660,7 +660,7 @@ HOST inline bool operator>(const float16& a, const float16& b) { #else // __HIPCC__ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hgt(half(a), half(b)); + return __hgt(a.to_half(), b.to_half()); #else return static_cast(a) > static_cast(b); #endif @@ -669,7 +669,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { #if defined(__HIPCC__) DEVICE inline bool operator>=(const float16& a, const float16& b) { - return __hge(half(a), half(b)); + return __hge(a.to_half(), b.to_half()); } HOST inline bool operator>=(const float16& a, const float16& b) { return static_cast(a) >= static_cast(b); @@ -677,7 +677,7 @@ HOST inline bool operator>=(const float16& a, const float16& b) { #else // __HIPCC__ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hge(half(a), half(b)); + return __hge(a.to_half(), b.to_half()); #else return static_cast(a) >= static_cast(b); #endif @@ -945,12 +945,12 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { // HIPCC has compile error if call __device__ function __hisnan in __host__ // __device__ function #if defined(PADDLE_CUDA_FP16) && defined(__HIPCC__) -DEVICE inline bool(isnan)(const float16& a) { return __hisnan(half(a)); } +DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); } HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; } #else HOSTDEVICE inline bool(isnan)(const float16& a) { #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hisnan(half(a)); + return __hisnan(a.to_half()); #else return (a.x & 0x7fff) > 0x7c00; #endif diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 75e35d398c27e..8be774441fe7c 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -48,8 +48,8 @@ limitations under the License. */ in1 = reinterpret_cast(malloc(size)); \ in2 = reinterpret_cast(malloc(size)); \ out = reinterpret_cast(malloc(size)); \ - in1[0] = half(float16(v_in1)); \ - in2[0] = half(float16(v_in2)); \ + in1[0] = float16(v_in1).to_half(); \ + in2[0] = float16(v_in2).to_half(); \ hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \ hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \ hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \ @@ -73,8 +73,8 @@ limitations under the License. */ hipMalloc(reinterpret_cast(&d_in2), size); \ in1 = reinterpret_cast(malloc(size)); \ in2 = reinterpret_cast(malloc(size)); \ - in1[0] = half(float16(v_in1)); \ - in2[0] = half(float16(v_in2)); \ + in1[0] = float16(v_in1).to_half(); \ + in2[0] = float16(v_in2).to_half(); \ hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \ hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \ hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2); \ @@ -99,8 +99,8 @@ limitations under the License. 
*/ in1 = reinterpret_cast(malloc(size)); \ in2 = reinterpret_cast(malloc(size)); \ out = reinterpret_cast(malloc(1)); \ - in1[0] = half(float16(v_in1)); \ - in2[0] = half(float16(v_in2)); \ + in1[0] = float16(v_in1).to_half(); \ + in2[0] = float16(v_in2).to_half(); \ hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \ hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \ hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \ @@ -126,8 +126,8 @@ limitations under the License. */ in1 = reinterpret_cast(malloc(size)); \ in2 = reinterpret_cast(malloc(size)); \ out = reinterpret_cast(malloc(size)); \ - in1[0] = half(float16(v_in1)); \ - in2[0] = half(float16(v_in2)); \ + in1[0] = float16(v_in1).to_half(); \ + in2[0] = float16(v_in2).to_half(); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ @@ -151,8 +151,8 @@ limitations under the License. */ cudaMalloc(reinterpret_cast(&d_in2), size); \ in1 = reinterpret_cast(malloc(size)); \ in2 = reinterpret_cast(malloc(size)); \ - in1[0] = half(float16(v_in1)); \ - in2[0] = half(float16(v_in2)); \ + in1[0] = float16(v_in1).to_half(); \ + in2[0] = float16(v_in2).to_half(); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2); \ @@ -177,8 +177,8 @@ limitations under the License. */ in1 = reinterpret_cast(malloc(size)); \ in2 = reinterpret_cast(malloc(size)); \ out = reinterpret_cast(malloc(1)); \ - in1[0] = half(float16(v_in1)); \ - in2[0] = half(float16(v_in2)); \ + in1[0] = float16(v_in1).to_half(); \ + in2[0] = float16(v_in2).to_half(); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ @@ -221,7 +221,7 @@ void TestNeg(float v_in, float v_out) { cudaMalloc(reinterpret_cast(&d_in), size); #endif in = reinterpret_cast(malloc(size)); - in[0] = half(float16(v_in)); + in[0] = float16(v_in).to_half(); #ifdef PADDLE_WITH_HIP hipMemcpy(d_in, in, size, hipMemcpyHostToDevice); #else @@ -299,17 +299,17 @@ TEST(float16, comparision_on_gpu) { TEST(float16, conversion_on_gpu) { // Explicit conversion to and from cuda half - EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00); - EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800); - EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555); - EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000); - EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000); - EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff); - EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00); + EXPECT_EQ(float16(float16(1.0f).to_half()).x, 0x3c00); + EXPECT_EQ(float16(float16(0.5f).to_half()).x, 0x3800); + EXPECT_EQ(float16(float16(0.33333f).to_half()).x, 0x3555); + EXPECT_EQ(float16(float16(0.0f).to_half()).x, 0x0000); + EXPECT_EQ(float16(float16(-0.0f).to_half()).x, 0x8000); + EXPECT_EQ(float16(float16(65504.0f).to_half()).x, 0x7bff); + EXPECT_EQ(float16(float16(65536.0f).to_half()).x, 0x7c00); // Assignment operator float16 v_assign; - v_assign = half(float16(1.0f)); + v_assign = float16(1.0f).to_half(); EXPECT_EQ(v_assign.x, 0x3c00); } diff --git a/paddle/fluid/platform/host_event_recorder.cc b/paddle/fluid/platform/host_event_recorder.cc new file mode 100644 index 0000000000000..750f39118d7d9 --- /dev/null +++ b/paddle/fluid/platform/host_event_recorder.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/host_event_recorder.h" +#include "paddle/fluid/platform/os_info.h" + +namespace paddle { +namespace platform { + +ThreadEventRecorder::ThreadEventRecorder() { + thread_id_ = ThreadIdRegistry::GetInstance().CurrentThreadId().MainTid(); + HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); +} + +HostEventSection HostEventRecorder::GatherEvents() { + HostEventSection host_sec; + host_sec.thr_sections.reserve(thread_recorders_.size()); + for (auto &kv : thread_recorders_) { + host_sec.thr_sections.emplace_back(std::move(kv.second->GatherEvents())); + } + return std::move(host_sec); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/host_event_recorder.h b/paddle/fluid/platform/host_event_recorder.h new file mode 100644 index 0000000000000..e8dd59ad4c6f1 --- /dev/null +++ b/paddle/fluid/platform/host_event_recorder.h @@ -0,0 +1,261 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/platform/event.h" + +namespace paddle { +namespace platform { + +template +struct ContainsStdString + : std::conditional_t< + std::is_same>>::value, + std::true_type, ContainsStdString> {}; + +template +struct ContainsStdString + : std::is_same>> {}; + +template +class EventContainer { + public: + EventContainer() { + event_blocks_ = cur_event_block_ = new EventBlock; + str_blocks_ = cur_str_block_ = new StringBlock; + } + ~EventContainer() { + Reduce(); + delete event_blocks_; + for (auto cur = str_blocks_; cur != nullptr;) { + auto next = cur->next; + delete cur; + cur = next; + } + } + DISABLE_COPY_AND_ASSIGN(EventContainer); + + public: + // Record an event + template + void Record(Args &&... args) { + DoRecord(ContainsStdString(), std::forward(args)...); + } + + // Get all events and clear the container + std::vector Reduce(); + + // Return a buffer to store the string attribute of Event. + // HostEventRecorder locates in the static data section. + // So it's safe to use arena to avoid fragmented allocations. 
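  // Editor's note (illustrative, not part of the patch): GetStrBufFromArena below
  // is what DoRecord(std::true_type, ...) wraps into the arena_allocator callback
  // handed to the EventType constructor (CommonEvent in this PR), so std::string
  // names/attrs are copied into block-local storage instead of allocating per
  // event. For example (start_ns/end_ns/role/attr_str are placeholders):
  //
  //   Record("matmul", start_ns, end_ns, role);               // const char*: no arena use
  //   Record(std::string("matmul#1"), start_ns, end_ns, role,
  //          attr_str);                                       // std::string: copied into the arena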
+ char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); } + + private: + struct EventBlock { + union InitDeferedEvent { + InitDeferedEvent() {} + ~InitDeferedEvent() {} + + EventType event; + }; + + static constexpr size_t kBlockSize = 1 << 24; // 16 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent); + static constexpr size_t kPadSize = + kAvailSize - kNumEvents * sizeof(InitDeferedEvent); + static constexpr size_t kMinimumEventsPerBlock = 1024; + static_assert( + kNumEvents >= kMinimumEventsPerBlock, + "EventType is too large for kBlockSize, make kBlockSize larger"); + + size_t offset = 0; + EventBlock *next = nullptr; + InitDeferedEvent events[kNumEvents]; + char padding[kPadSize]; + }; + static_assert(sizeof(EventBlock) == EventBlock::kBlockSize, + "sizeof EventBlock must equal to kBlockSize"); + + struct StringBlock { + static constexpr size_t kBlockSize = 1 << 22; // 4 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + + size_t offset = 0; + StringBlock *next = nullptr; + char storage[kAvailSize]; + }; + static_assert(sizeof(StringBlock) == StringBlock::kBlockSize, + "sizeof StringBlock must equal to kBlockSize"); + + // Record an event with string arguments + template + void DoRecord(std::true_type, Args &&... args) { + auto *storage = GetEventStorage(); + std::function allocator = [this](size_t size) { + return GetStrBufFromArena(size); + }; + new (storage) EventType(allocator, std::forward(args)...); + } + + // Record an event without any string argument + template + void DoRecord(std::false_type, Args &&... args) { + auto *storage = GetEventStorage(); + new (storage) EventType(std::forward(args)...); + } + + EventType *GetEventStorage(); + + char *GetStringStorage(size_t sz); + + EventBlock *event_blocks_ = nullptr; + EventBlock *cur_event_block_ = nullptr; + StringBlock *str_blocks_ = nullptr; + StringBlock *cur_str_block_ = nullptr; +}; + +template +std::vector EventContainer::Reduce() { + std::vector all_events; + size_t event_cnt = 0; + for (auto cur = event_blocks_; cur != nullptr; cur = cur->next) { + event_cnt += cur->offset; + } + all_events.reserve(event_cnt); + for (auto cur = event_blocks_; cur != nullptr;) { + for (size_t i = 0; i < cur->offset; ++i) { + all_events.emplace_back(cur->events[i].event); + } + auto next = cur->next; + delete cur; + cur = next; + } + event_blocks_ = cur_event_block_ = new EventBlock; + return std::move(all_events); +} + +template +EventType *EventContainer::GetEventStorage() { + if (UNLIKELY(cur_event_block_->offset >= + EventBlock::kNumEvents)) { // another block + cur_event_block_->next = new EventBlock; + cur_event_block_ = cur_event_block_->next; + } + auto &obj = cur_event_block_->events[cur_event_block_->offset].event; + ++cur_event_block_->offset; + return &obj; +} + +template +char *EventContainer::GetStringStorage(size_t sz) { + if (UNLIKELY(cur_str_block_->offset + sz > + StringBlock::kAvailSize)) { // another block + cur_str_block_->next = new StringBlock; + cur_str_block_ = cur_str_block_->next; + } + char *storage = cur_str_block_->storage + cur_str_block_->offset; + cur_str_block_->offset += sz; + return storage; +} + +struct ThreadEventSection { + std::string thread_name; + uint64_t thread_id; + std::vector events; +}; + +class ThreadEventRecorder { + public: + ThreadEventRecorder(); + DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); + + public: + // 
Forward call to EventContainer::Record + template + void RecordEvent(Args &&... args) { + base_evt_cntr_.Record(std::forward(args)...); + } + + ThreadEventSection GatherEvents() { + ThreadEventSection thr_sec; + thr_sec.thread_name = thread_name_; + thr_sec.thread_id = thread_id_; + thr_sec.events = std::move(base_evt_cntr_.Reduce()); + return std::move(thr_sec); + } + + private: + uint64_t thread_id_; + std::string thread_name_; + EventContainer base_evt_cntr_; +}; + +struct HostEventSection { + std::string process_name; + uint64_t process_id; + std::vector thr_sections; +}; + +class HostEventRecorder { + public: + // singleton + static HostEventRecorder &GetInstance() { + static HostEventRecorder instance; + return instance; + } + + // If your string argument has a longer lifetime than the Event, + // use 'const char*'. e.g.: string literal, op name, etc. + // Do your best to avoid using 'std::string' as the argument type. + // It will cause deep-copy to harm performance. + template + void RecordEvent(Args &&... args) { + GetThreadLocalRecorder().RecordEvent(std::forward(args)...); + } + + // Poor performance, call it at the ending + HostEventSection GatherEvents(); + + void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) { + const std::lock_guard guard(thread_recorders_lock_); + thread_recorders_[tid] = recorder; + } + + private: + HostEventRecorder() = default; + DISABLE_COPY_AND_ASSIGN(HostEventRecorder); + + ThreadEventRecorder &GetThreadLocalRecorder() { + static thread_local ThreadEventRecorder tls_recorder; + return tls_recorder; + } + + std::mutex thread_recorders_lock_; + std::unordered_map thread_recorders_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index b98ca33285a39..7a528cf8d6be1 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -269,9 +269,13 @@ inline dnnl::memory::format_tag GetMKLDNNFormat(dnnl::memory::desc mem_desc) { if (inner_nblks == 0) { if (strides[0] >= strides[1] && strides[1] >= strides[2] && strides[2] >= strides[3] && strides[3] >= strides[4]) { - return dnnl::memory::format_tag::ncdhw; - } else { - return dnnl::memory::format_tag::ndhwc; + return dnnl::memory::format_tag::abcde; + } else if (strides[0] >= strides[2] && strides[2] >= strides[1] && + strides[1] >= strides[3] && strides[3] >= strides[4]) { + return dnnl::memory::format_tag::acbde; + } else if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[4] && strides[4] >= strides[1]) { + return dnnl::memory::format_tag::acdeb; } } else if (inner_nblks == 1) { if (inner_blks[0] == 8 && inner_idxs[0] == 0) { @@ -310,6 +314,10 @@ inline dnnl::memory::format_tag GetMKLDNNFormat(dnnl::memory::desc mem_desc) { strides[2] >= strides[3] && strides[3] >= strides[4] && strides[4] >= strides[5]) { return dnnl::memory::format_tag::abcdef; + } else if (strides[0] >= strides[2] && strides[2] >= strides[1] && + strides[1] >= strides[3] && strides[3] >= strides[4] && + strides[4] >= strides[5]) { + return dnnl::memory::format_tag::acbdef; } } } @@ -397,7 +405,9 @@ inline MKLDNNMemoryFormat MKLDNNFormatForSize(size_t dims_size, return MKLDNNMemoryFormat::ndhwc; } } else if (dims_size == 6) { - return MKLDNNMemoryFormat::abcdef; + if (data_format == MKLDNNMemoryFormat::nchw) { + return MKLDNNMemoryFormat::abcdef; + } } return data_format; } diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc 
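Editor's note: before the os_info.cc diff that follows, a hedged sketch of how the host event recorder introduced above is meant to be driven. RecordEvent forwards to a thread-local ThreadEventRecorder (only the one-time thread registration takes a lock), while GatherEvents is the expensive end-of-run aggregation. The arguments below follow the const char* CommonEvent constructor; start_ns/end_ns would normally come from PosixInNsec().

#include "paddle/fluid/platform/host_event_recorder.h"

void TraceOneRange(uint64_t start_ns, uint64_t end_ns) {
  using paddle::platform::EventRole;
  using paddle::platform::HostEventRecorder;
  // const char* name: stored as-is, the string arena is not involved.
  HostEventRecorder::GetInstance().RecordEvent("forward_pass", start_ns, end_ns,
                                               EventRole::kOrdinary);
}

paddle::platform::HostEventSection DumpAllThreads() {
  // Call once, at the end of profiling; it walks every registered thread recorder.
  return paddle::platform::HostEventRecorder::GetInstance().GatherEvents();
}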
index 9af89645f54ea..5ba7f1d144e12 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -14,17 +14,32 @@ limitations under the License. */ #include "paddle/fluid/platform/os_info.h" #include -#include "paddle/fluid/platform/device_tracer.h" +#if defined(__linux__) +#include +#include +#include +#elif defined(_MSC_VER) +#include +#endif namespace paddle { namespace platform { ThreadId::ThreadId() { + // C++ std tid std_tid_ = std::hash()(std::this_thread::get_id()); +// system tid +#if defined(__linux__) + sys_tid_ = syscall(SYS_gettid); +#elif defined(_MSC_VER) + sys_tid_ = GetCurrentThreadId(); +#else // unsupported platforms + sys_tid_ = 0; +#endif + // cupti tid std::stringstream ss; ss << std::this_thread::get_id(); cupti_tid_ = static_cast(std::stoull(ss.str())); - RecoreCurThreadId(MainTid()); // For DeviceTracer } ThreadIdRegistry::~ThreadIdRegistry() { diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h index b243429fd5a89..c38198f91b36b 100644 --- a/paddle/fluid/platform/os_info.h +++ b/paddle/fluid/platform/os_info.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/enforce.h" // import LIKELY +#include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/port.h" #ifdef _POSIX_C_SOURCE #include @@ -27,7 +27,7 @@ limitations under the License. */ namespace paddle { namespace platform { -// Get current time in nanoseconds +// Get system-wide realtime clock in nanoseconds inline uint64_t PosixInNsec() { #ifdef _POSIX_C_SOURCE struct timespec tp; @@ -45,13 +45,13 @@ class ThreadId { public: ThreadId(); - uint64_t MainTid() const { return std_tid_; } + uint64_t MainTid() const { return SysTid(); } uint64_t StdTid() const { return std_tid_; } uint32_t CuptiTid() const { return cupti_tid_; } - uint64_t SysTid() const { return sys_tid_; } + uint64_t SysTid() const { return sys_tid_ != 0 ? sys_tid_ : std_tid_; } private: uint64_t std_tid_ = 0; // std::hash diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 1bedd5b130844..eaa77273c8fd4 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -20,6 +20,7 @@ limitations under the License. 
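The ThreadId constructor in os_info.cc above now captures three ids for one thread: the std::hash of std::thread::id, the OS-level thread id, and the CUPTI-style id parsed from the stream representation, with MainTid()/SysTid() in os_info.h falling back to the hash id when the OS id is unavailable. A minimal sketch of the per-platform OS id lookup, written as a standalone helper for illustration only:

    #include <cstdint>
    #if defined(__linux__)
    #include <sys/syscall.h>
    #include <unistd.h>
    #elif defined(_MSC_VER)
    #include <windows.h>
    #endif

    inline uint64_t CurrentSysTid() {
    #if defined(__linux__)
      return static_cast<uint64_t>(syscall(SYS_gettid));   // kernel thread id
    #elif defined(_MSC_VER)
      return static_cast<uint64_t>(GetCurrentThreadId());  // Win32 thread id
    #else
      return 0;  // unsupported platform; the caller falls back to the std::hash id
    #endif
    }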
*/ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/host_event_recorder.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler_helper.h" #ifdef PADDLE_WITH_CUDA @@ -36,286 +37,6 @@ DEFINE_bool(enable_host_event_recorder_hook, false, namespace paddle { namespace platform { -struct DurationEvent { - public: - DurationEvent(const char *name, uint64_t start_ns, uint64_t end_ns, - EventRole role) - : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} - - DurationEvent(std::function &arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, const std::string &attr_str) - : start_ns(start_ns), end_ns(end_ns), role(role) { - auto buf = static_cast(arena_allocator(name_str.length() + 1)); - strncpy(buf, name_str.c_str(), name_str.length() + 1); - name = buf; - buf = static_cast(arena_allocator(attr_str.length() + 1)); - strncpy(buf, attr_str.c_str(), attr_str.length() + 1); - attr = buf; - } - - DurationEvent(const std::function &arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role) - : start_ns(start_ns), end_ns(end_ns), role(role) { - auto buf = static_cast(arena_allocator(name_str.length() + 1)); - strncpy(buf, name_str.c_str(), name_str.length() + 1); - name = buf; - } - - const char *name = nullptr; // not owned, designed for performance - uint64_t start_ns = 0; - uint64_t end_ns = 0; - EventRole role = EventRole::kOrdinary; - const char *attr = nullptr; // not owned, designed for performance -}; - -template -struct ContainsStdString - : std::conditional_t< - std::is_same>>::value, - std::true_type, ContainsStdString> {}; - -template -struct ContainsStdString - : std::is_same>> {}; - -template -class EventContainer { - public: - EventContainer() { - event_blocks_ = cur_event_block_ = new EventBlock; - str_blocks_ = cur_str_block_ = new StringBlock; - } - ~EventContainer() { - Reduce(); - delete event_blocks_; - for (auto cur = str_blocks_; cur != nullptr;) { - auto next = cur->next; - delete cur; - cur = next; - } - } - DISABLE_COPY_AND_ASSIGN(EventContainer); - - public: - // Record an event - template - void Record(Args &&... args) { - DoRecord(ContainsStdString(), std::forward(args)...); - } - - // Get all events and clear the container - std::vector Reduce(); - - // Return a buffer to store the string attribute of Event. - // HostEventRecorder locates in the static data section. - // So it's safe to use arena to avoid fragmented allocations. 
- char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); } - - private: - struct EventBlock { - union InitDeferedEvent { - InitDeferedEvent() {} - ~InitDeferedEvent() {} - - EventType event; - }; - - static constexpr size_t kBlockSize = 1 << 24; // 16 MB - static constexpr size_t kAvailSize = - kBlockSize - sizeof(size_t) - sizeof(nullptr); - static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent); - static constexpr size_t kPadSize = - kAvailSize - kNumEvents * sizeof(InitDeferedEvent); - static constexpr size_t kMinimumEventsPerBlock = 1024; - static_assert( - kNumEvents >= kMinimumEventsPerBlock, - "EventType is too large for kBlockSize, make kBlockSize larger"); - - size_t offset = 0; - EventBlock *next = nullptr; - InitDeferedEvent events[kNumEvents]; - char padding[kPadSize]; - }; - static_assert(sizeof(EventBlock) == EventBlock::kBlockSize, - "sizeof EventBlock must equal to kBlockSize"); - - struct StringBlock { - static constexpr size_t kBlockSize = 1 << 22; // 4 MB - static constexpr size_t kAvailSize = - kBlockSize - sizeof(size_t) - sizeof(nullptr); - - size_t offset = 0; - StringBlock *next = nullptr; - char storage[kAvailSize]; - }; - static_assert(sizeof(StringBlock) == StringBlock::kBlockSize, - "sizeof StringBlock must equal to kBlockSize"); - - // Record an event with string arguments - template - void DoRecord(std::true_type, Args &&... args) { - auto *storage = GetEventStorage(); - std::function allocator = [this](size_t size) { - return GetStrBufFromArena(size); - }; - new (storage) EventType(allocator, std::forward(args)...); - } - - // Record an event without any string argument - template - void DoRecord(std::false_type, Args &&... args) { - auto *storage = GetEventStorage(); - new (storage) EventType(std::forward(args)...); - } - - EventType *GetEventStorage(); - - char *GetStringStorage(size_t sz); - - EventBlock *event_blocks_ = nullptr; - EventBlock *cur_event_block_ = nullptr; - StringBlock *str_blocks_ = nullptr; - StringBlock *cur_str_block_ = nullptr; -}; - -template -std::vector EventContainer::Reduce() { - std::vector all_events; - size_t event_cnt = 0; - for (auto cur = event_blocks_; cur != nullptr; cur = cur->next) { - event_cnt += cur->offset; - } - all_events.reserve(event_cnt); - for (auto cur = event_blocks_; cur != nullptr;) { - for (size_t i = 0; i < cur->offset; ++i) { - all_events.emplace_back(cur->events[i].event); - } - auto next = cur->next; - delete cur; - cur = next; - } - event_blocks_ = cur_event_block_ = new EventBlock; - return std::move(all_events); -} - -template -EventType *EventContainer::GetEventStorage() { - if (UNLIKELY(cur_event_block_->offset >= - EventBlock::kNumEvents)) { // another block - cur_event_block_->next = new EventBlock; - cur_event_block_ = cur_event_block_->next; - } - auto &obj = cur_event_block_->events[cur_event_block_->offset].event; - ++cur_event_block_->offset; - return &obj; -} - -template -char *EventContainer::GetStringStorage(size_t sz) { - if (UNLIKELY(cur_str_block_->offset + sz > - StringBlock::kAvailSize)) { // another block - cur_str_block_->next = new StringBlock; - cur_str_block_ = cur_str_block_->next; - } - char *storage = cur_str_block_->storage + cur_str_block_->offset; - cur_str_block_->offset += sz; - return storage; -} - -struct ThreadEventSection { - std::string thread_name; - uint64_t thread_id; - std::vector events; -}; - -class ThreadEventRecorder { - public: - ThreadEventRecorder(); - DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); - - public: - // 
Forward call to EventContainer::Record - template - void RecordEvent(Args &&... args) { - base_evt_cntr_.Record(std::forward(args)...); - } - - ThreadEventSection GatherEvents() { - ThreadEventSection thr_sec; - thr_sec.thread_name = thread_name_; - thr_sec.thread_id = thread_id_; - thr_sec.events = std::move(base_evt_cntr_.Reduce()); - return std::move(thr_sec); - } - - private: - uint64_t thread_id_; - std::string thread_name_; - EventContainer base_evt_cntr_; -}; - -struct HostEventSection { - std::string process_name; - uint64_t process_id; - std::vector thr_sections; -}; - -class HostEventRecorder { - public: - // singleton - static HostEventRecorder &GetInstance() { - static HostEventRecorder instance; - return instance; - } - - // If your string argument has a longer lifetime than the Event, - // use 'const char*'. e.g.: string literal, op name, etc. - // Do your best to avoid using 'std::string' as the argument type. - // It will cause deep-copy to harm performance. - template - void RecordEvent(Args &&... args) { - GetThreadLocalRecorder().RecordEvent(std::forward(args)...); - } - - // Poor performance, call it at the ending - HostEventSection GatherEvents(); - - void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) { - const std::lock_guard guard(thread_recorders_lock_); - thread_recorders_[tid] = recorder; - } - - private: - HostEventRecorder() = default; - DISABLE_COPY_AND_ASSIGN(HostEventRecorder); - - ThreadEventRecorder &GetThreadLocalRecorder() { - static thread_local ThreadEventRecorder tls_recorder; - return tls_recorder; - } - - std::mutex thread_recorders_lock_; - std::unordered_map thread_recorders_; -}; - -ThreadEventRecorder::ThreadEventRecorder() { - thread_id_ = ThreadIdRegistry::GetInstance().CurrentThreadId().MainTid(); - HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); -} - -HostEventSection HostEventRecorder::GatherEvents() { - HostEventSection host_sec; - host_sec.thr_sections.reserve(thread_recorders_.size()); - for (auto &kv : thread_recorders_) { - host_sec.thr_sections.emplace_back(std::move(kv.second->GatherEvents())); - } - return std::move(host_sec); -} - MemEvenRecorder MemEvenRecorder::recorder; Event::Event(EventType type, std::string name, uint32_t thread_id, @@ -416,7 +137,11 @@ void RecordEvent::OriginalConstruct(const std::string &name, *name_ = e->name(); } -RecordEvent::~RecordEvent() { +void RecordEvent::End() { + if (UNLIKELY(finished_)) { + return; + } + finished_ = true; #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook && is_pushed_) { @@ -456,6 +181,15 @@ RecordEvent::~RecordEvent() { delete attr_; } +RecordInstantEvent::RecordInstantEvent(const char *name, const EventRole role) { + if (UNLIKELY(FLAGS_enable_host_event_recorder_hook == false)) { + return; + } + auto start_end_ns = PosixInNsec(); + HostEventRecorder::GetInstance().RecordEvent(name, start_end_ns, start_end_ns, + role); +} + void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, size_t size) { if (g_state == ProfilerState::kDisabled) return; @@ -740,8 +474,9 @@ std::string PrintHostEvents() { for (const auto &thr_evt_sec : host_evt_sec.thr_sections) { oss << thr_evt_sec.thread_id << std::endl; for (const auto &evt : thr_evt_sec.events) { - oss << "{ " << evt.name << " | " << evt.start_ns << " | " << evt.end_ns - << " }" << std::endl; + oss << "{ " << evt.name << " | " << evt.start_ns << "ns | " << evt.end_ns + << "ns | " << (evt.end_ns - evt.start_ns) / 1000.000 << "us }" + << std::endl; } } return 
oss.str(); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 9d0bdf2358900..41cc3805f44da 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -27,9 +27,9 @@ limitations under the License. */ #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/event.h" +#include "paddle/fluid/platform/event_tracing.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.pb.h" - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -127,43 +127,6 @@ struct MemEvenRecorder { DISABLE_COPY_AND_ASSIGN(MemEvenRecorder); }; -struct RecordEvent { - explicit RecordEvent(const std::string& name, - const EventRole role = EventRole::kOrdinary); - - explicit RecordEvent(const char* name, - const EventRole role = EventRole::kOrdinary); - - RecordEvent(const std::string& name, const EventRole role, - const std::string& attr); - - ~RecordEvent(); - - void OriginalConstruct(const std::string& name, const EventRole role, - const std::string& attr); - - bool is_enabled_{false}; - bool is_pushed_{false}; - // Event name - std::string* name_{nullptr}; - const char* shallow_copy_name_{nullptr}; - uint64_t start_ns_; - // Need to distinguish name by op type, block_id, program_id and perhaps - // different kernel invocations within an op. - // std::string full_name_; - EventRole role_{EventRole::kOrdinary}; - std::string* attr_{nullptr}; -}; - -/*class RecordRPCEvent { - public: - explicit RecordRPCEvent(const std::string& name); - ~RecordRPCEvent() {} - - private: - std::unique_ptr event_; -};*/ - struct RecordBlock { explicit RecordBlock(int block_id); ~RecordBlock(); diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index dafb61fe0aaf4..742d267b59543 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -56,7 +56,7 @@ void CUDAStream::Destroy() { CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); Wait(); WaitCallback(); - if (stream_) { + if (stream_ && owned_stream_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); #else @@ -92,6 +92,20 @@ void CUDAStream::Wait() const { PADDLE_ENFORCE_GPU_SUCCESS(e_sync); } +// Note: Can only be used under thread_local semantics. +void CUDAStream::SetStream(gpuStream_t stream) { + if (owned_stream_ && stream_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); +#endif + } + owned_stream_ = false; + stream_ = stream; + callback_manager_.reset(new StreamCallbackManager(stream_)); +} + CUDAStream* get_current_stream(int deviceId) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (deviceId == -1) { diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 36f31c46673b2..0683cf4b0424e 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -18,6 +18,7 @@ limitations under the License. 
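The owned_stream_ flag added to CUDAStream above reduces Destroy() and SetStream() to one ownership rule: the wrapper destroys only streams it created itself, and adopting an external stream first releases any stream it still owns. A condensed sketch of that rule, CUDA path only and with error checking omitted (the real code wraps each call in PADDLE_ENFORCE_GPU_SUCCESS):

    #include <cuda_runtime.h>

    class StreamHolder {
     public:
      ~StreamHolder() {
        // Never destroy a borrowed stream; its creator is responsible for it.
        if (stream_ != nullptr && owned_) cudaStreamDestroy(stream_);
      }
      // Adopt a stream created elsewhere; comparable to CUDAStream::SetStream above.
      void SetExternal(cudaStream_t stream) {
        if (owned_ && stream_ != nullptr) cudaStreamDestroy(stream_);  // drop our own stream
        owned_ = false;  // the new stream is borrowed
        stream_ = stream;
      }
     private:
      cudaStream_t stream_{nullptr};
      bool owned_{true};
    };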
*/ #include #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream_callback_manager.h" @@ -130,8 +131,12 @@ class CUDAStream final { const Place& GetPlace() const { return place_; } + // Note: Can only be used under thread_local semantics. + void SetStream(gpuStream_t stream); + private: Place place_; + bool owned_stream_{true}; #ifdef PADDLE_WITH_HIP hipStream_t stream_{nullptr}; #else diff --git a/paddle/fluid/platform/stream/stream.h b/paddle/fluid/platform/stream/stream.h new file mode 100644 index 0000000000000..79ca51220bd6c --- /dev/null +++ b/paddle/fluid/platform/stream/stream.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +namespace paddle { +namespace platform { + +using StreamId = uint64_t; +class Stream final { + public: + explicit Stream(StreamId id) : id_(id) {} + StreamId id() const { return id_; } + + private: + StreamId id_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index f6c54c2397b18..7fce0296d437a 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -13,13 +13,7 @@ // limitations under the License. 
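The new stream.h above introduces platform::Stream as a plain value type around a 64-bit StreamId, so other components can refer to a stream without pulling in CUDA/HIP headers. One plausible way to produce such an id, shown purely as an assumption for illustration and not taken from this patch, is to reuse the numeric value of the opaque driver handle:

    #include <cstdint>

    using StreamId = uint64_t;

    class Stream final {
     public:
      explicit Stream(StreamId id) : id_(id) {}
      StreamId id() const { return id_; }
     private:
      StreamId id_;
    };

    // Hypothetical adapter: derive an id from an opaque handle such as gpuStream_t.
    inline Stream MakeStreamKey(const void* handle) {
      return Stream(static_cast<StreamId>(reinterpret_cast<std::uintptr_t>(handle)));
    }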
#include "paddle/fluid/platform/stream_callback_manager.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/npu/npu_info.h" -#include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_MLU -#include "paddle/fluid/platform/device/mlu/enforce.h" -#include "paddle/fluid/platform/device/mlu/mlu_info.h" -#endif +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index a6d7352106dbc..6eb71829769dc 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -275,7 +275,7 @@ if(WITH_PYTHON) if(NOT ON_INFER) cc_library(paddle_eager SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc - DEPS eager_api autograd_meta backward grad_node_info pten op_function_common dygraph_function dygraph_node math_cpu linalg_cpu utils_cpu manipulation_cpu accumulation_node global_utils utils python) + DEPS eager_api autograd_meta backward grad_node_info pten op_function_common dygraph_function dygraph_node accumulation_node global_utils utils python) add_dependencies(paddle_eager eager_codegen) add_dependencies(paddle_eager eager_op_function_generator_cmd) list(APPEND PYBIND_DEPS paddle_eager) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 94ff2eb4c1e23..9484d506b20fb 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/utils.h" @@ -41,6 +42,7 @@ namespace py = ::pybind11; PyTypeObject* p_eager_tensor_type; extern PyTypeObject* g_vartype_pytype; +extern PyTypeObject* g_framework_tensor_pytype; PyObject* EagerTensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { PyObject* obj = type->tp_alloc(type, 0); @@ -60,6 +62,13 @@ void EmptyEagerTensorInitializer( const std::vector& dims = {}, framework::proto::VarType::Type var_type = paddle::framework::proto::VarType::LOD_TENSOR) { + auto ddims = paddle::framework::make_ddim(dims); + PADDLE_ENFORCE_GE( + paddle::framework::product(ddims), 0, + paddle::platform::errors::InvalidArgument( + "Create Eager Tensor with dims contain minus num is ilegal" + "Please check your code and make sure you new a " + "eager tensor with fixed shape instead of using -1.")); self->eager_tensor.set_name(name); auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->eager_tensor)); autograd_meta->SetPersistable(persistable); @@ -69,9 +78,19 @@ void EmptyEagerTensorInitializer( std::shared_ptr dense_tensor = std::make_shared( pten::make_intrusive(place), - pten::DenseTensorMeta(pten::TransToPtenDataType(dtype), - paddle::framework::make_ddim(dims))); + pten::DenseTensorMeta(pten::TransToPtenDataType(dtype), ddims)); self->eager_tensor.set_impl(dense_tensor); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "We only support LoDTensor to be constructed by this initializer, " + "please check your var type first and make sure you are going to " + "construct LoDTensor.")); + } + + if (!autograd_meta->GetMutableGradNode()) { + VLOG(3) << "Tensor(" << name + << ") have not GradNode, add GradNodeAccumulation for it."; + autograd_meta->SetGradNode(std::make_shared()); } } @@ -142,11 +161,239 @@ void 
InitEagerTensorWithEagerTensor(EagerTensorObject* self, } } -// TODO(jiabin): We have to do some ugly work, refactor this method using -// PyArg_ParseTuple(),PyArg_ParseTupleAndKeywords() and PyArg_Parse() later to -// support kwargs. -int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwds) { - /** We should have init function with signature: +void InitEagerTensorWithFrameworkTensor(EagerTensorObject* self, + const framework::Tensor& src, + const paddle::platform::Place& place, + const std::string& name) { + self->eager_tensor.set_name(name); + if (place == src.place()) { + std::shared_ptr dense_tensor = + std::make_shared( + pten::make_intrusive(place), + pten::DenseTensorMeta(pten::TransToPtenDataType(src.type()), + src.dims())); + paddle::experimental::ReMakePtenDenseTensor(src, dense_tensor.get()); + self->eager_tensor.set_impl(dense_tensor); + VLOG(4) << "Same place, do ShareDataWith"; + } else { + std::shared_ptr dense_tensor = + std::make_shared( + pten::make_intrusive( + src.place()), + pten::DenseTensorMeta(pten::TransToPtenDataType(src.type()), + src.dims())); + paddle::experimental::ReMakePtenDenseTensor(src, dense_tensor.get()); + auto temp = egr::EagerTensor(dense_tensor); + self->eager_tensor.set_impl( + temp.copy_to(pten::TransToPtenBackend(place), true).impl()); + VLOG(4) << "Different place, do TensorCopy"; + } + egr::EagerUtils::autograd_meta(&(self->eager_tensor))->SetStopGradient(true); + egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor) + ->SetPersistable(false); +} + +py::object ParsePyArray( + std::unordered_map kws_map, + std::unordered_map kw_order_map, PyObject* args, + bool flag_kwargs, Py_ssize_t args_num) { + py::object numpy_value = py::object(); + + if (kw_order_map["value"] <= args_num) { + numpy_value = py::object( + py::handle(PyTuple_GET_ITEM(args, kw_order_map["value"] - 1)), true); + } else { + if (flag_kwargs && kws_map["value"] != NULL) { + numpy_value = py::object(py::handle(kws_map["value"]), true); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The first expected arguments is {value: PyArray}, " + "but could not parse the first argument {value: PyArray} " + "successfully. 
" + "Please check your input first and make sure you are on the right " + "way.")); + } + } + return numpy_value; +} + +paddle::platform::Place ParsePlace( + std::unordered_map kws_map, + std::unordered_map kw_order_map, PyObject* args, + bool flag_kwargs, Py_ssize_t args_num) { + paddle::platform::Place place = + egr::Controller::Instance().GetExpectedPlace(); + + if (kw_order_map["place"] <= args_num) { + place = CastPyArg2Place(PyTuple_GET_ITEM(args, kw_order_map["place"] - 1), + kw_order_map["place"] - 1); + } else { + if (flag_kwargs && kws_map["place"] != NULL) { + place = CastPyArg2Place(kws_map["place"], 0); + } else { + // default + return place; + } + } + return place; +} + +// boolean arguments: zero_copy, stop_gradient, persistable +bool ParseBooleanArgs(std::string key, + std::unordered_map kws_map, + std::unordered_map kw_order_map, + PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { + bool res = false; + if (key == "stop_gradient") res = true; + + if (kw_order_map[key] <= args_num) { + res = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, kw_order_map[key] - 1), + kw_order_map[key] - 1); + } else { + if (flag_kwargs && kws_map[key] != NULL) { + res = CastPyArg2AttrBoolean(kws_map[key], 0); + } else { + return res; + } + } + return res; +} + +std::string ParseName(std::unordered_map kws_map, + std::unordered_map kw_order_map, + PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { + std::string act_name = ""; + if (kw_order_map["name"] <= args_num) { + PyObject* name_obj = PyTuple_GET_ITEM(args, kw_order_map["name"] - 1); + if (name_obj == Py_None) { + act_name = + egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + } else { + act_name = CastPyArg2AttrString(name_obj, kw_order_map["name"] - 1); + } + } else { + if (flag_kwargs) { + if (kws_map["name"] == NULL) { + act_name = + egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + } else { + act_name = CastPyArg2AttrString(kws_map["name"], 0); + } + } else { + act_name = + egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + } + } + return act_name; +} + +// initialize EagerTensor by PyArray(first argument is PyArray, +// mix args and kwargs) automatically. +void AutoInitEagerTensorByPyArray( + EagerTensorObject* py_tensor_ptr, + std::unordered_map kws_map, PyObject* args, + bool flag_kwargs, Py_ssize_t args_num) { + // The first argument of the EagerTensor constructor is PyArray, + // there are 6 arguments to construct the new EagerTensor, + // kw_order_map's key is every arguments of the constructor, + // kw_order_map's value is the position of the arguments respectively. + // If u want to update this constructor with new arguments, + // need to update this map and to add or change related code. 
+ std::unordered_map kw_order_map{ + {"value", 1}, {"place", 2}, {"persistable", 3}, + {"zero_copy", 4}, {"name", 5}, {"stop_gradient", 6}}; + + py::object numpy_value = py::object(); + paddle::platform::Place place = + egr::Controller::Instance().GetExpectedPlace(); + bool persistable = false; + bool zero_copy = false; + std::string act_name = ""; + bool stop_gradient = true; + + numpy_value = + ParsePyArray(kws_map, kw_order_map, args, flag_kwargs, args_num); + place = ParsePlace(kws_map, kw_order_map, args, flag_kwargs, args_num); + persistable = ParseBooleanArgs("persistable", kws_map, kw_order_map, args, + flag_kwargs, args_num); + zero_copy = ParseBooleanArgs("zero_copy", kws_map, kw_order_map, args, + flag_kwargs, args_num); + act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num); + stop_gradient = ParseBooleanArgs("stop_gradient", kws_map, kw_order_map, args, + flag_kwargs, args_num); + + EmptyEagerTensorInitializer(py_tensor_ptr, act_name, place, persistable, + stop_gradient); + InitEagerTensorWithNumpyValue(py_tensor_ptr, numpy_value, zero_copy); +} + +// initialize EagerTensor by EagerTensor or framework::Tensor (mix args and +// kwargs) automatically. +void AutoInitEagerTensorByTensor( + EagerTensorObject* py_tensor_ptr, + std::unordered_map kws_map, PyObject* args, + bool flag_kwargs, Py_ssize_t args_num, bool init_by_egr_tensor = true) { + // The first argument of the EagerTensor constructor is EagerTensor or + // framework Tensor, + // there are 3 arguments to construct the new EagerTensor, + // kw_order_map's key is every arguments of the constructor, + // kw_order_map's value is the position of the arguments respectively. + // If u want to update this constructor with new arguments, + // need to update this map and to add or change related code. + std::unordered_map kw_order_map{ + {"value", 1}, {"place", 2}, {"name", 3}}; + + paddle::platform::Place place = + egr::Controller::Instance().GetExpectedPlace(); + std::string act_name = ""; + + place = ParsePlace(kws_map, kw_order_map, args, flag_kwargs, args_num); + act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num); + + if (init_by_egr_tensor) { + egr::EagerTensor src_tensor; + if (kw_order_map["value"] <= args_num) { + src_tensor = CastPyArg2EagerTensor( + PyTuple_GET_ITEM(args, kw_order_map["value"] - 1), + kw_order_map["value"] - 1); + } else { + if (flag_kwargs && kws_map["value"] != NULL) { + src_tensor = CastPyArg2EagerTensor(kws_map["value"], 0); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The first expected kwargs is {value: EagerTensor}, " + "but could not parse the first argument {value: EagerTensor} " + "successfully. " + "Please check your input first and make sure you are on the right " + "way.")); + } + } + InitEagerTensorWithEagerTensor(py_tensor_ptr, src_tensor, place, act_name); + } else { + // init by framework tensor + framework::Tensor src_tensor; + if (kw_order_map["value"] <= args_num) { + src_tensor = CastPyArg2FrameworkTensor( + PyTuple_GET_ITEM(args, kw_order_map["value"] - 1), + kw_order_map["value"] - 1); + } else { + if (flag_kwargs && kws_map["value"] != NULL) { + src_tensor = CastPyArg2FrameworkTensor(kws_map["value"], 0); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The first expected arguments is {value: framework::Tensor}, " + "but could not parse the first argument {value: framework::Tensor} " + "successfully. 
" + "Please check your input first and make sure you are on the right " + "way.")); + } + } + InitEagerTensorWithFrameworkTensor(py_tensor_ptr, src_tensor, place, + act_name); + } +} + +/** We should have init function with signature: * 1. * def __init__ () * 2. @@ -154,9 +401,11 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwds) { * ** dtype: paddle::framework::proto::VarType::Type, * ** dims: vector, * ** name: std::string, - * ** type: paddle::framework::proto::VarType::Type, + * ** type: paddle::framework::proto::VarType::LodTensor, * ** persistable: bool) - * 3. (multi-place) (must have first 2 parameter) + * 3. (multi-place) + * (should have at least one parameter, one parameter equals to case 4, zero + * parameter equals to case 1) * def __init__ ( * ** value: ndarray, * ** place: paddle::platform::Place, @@ -170,12 +419,77 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwds) { * 5. * def __init__ ( * ** tensor: EagerTensor) - * 6. (multi-place) (must have first 2 parameter) + * 6. (multi-place) + * (should have at least one parameter, one parameter equals to case 5, zero + * parameter equals to case 1.) * def __init__ ( * ** tensor: EagerTensor, * ** place: paddle::platform::Place, * ** name: std::string) + * 7. (multi-place) (should have at least one parameter, one parameter similar + * to case 5, zero parameter equals to case 1.) + * def __init__ ( + * ** tensor: FrameworkTensor, + * ** place: paddle::platform::Place, + * ** name: std::string) * **/ +int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { + // set a flag to record use kwargs or not + bool flag_kwargs = false; + if (kwargs) flag_kwargs = true; + + // all kwargs + PyObject* kw_zero_copy = NULL; + PyObject* kw_persistable = NULL; + PyObject* kw_stop_gradient = NULL; + + PyObject* kw_value = NULL; // receive PyArray or EagerTensor + PyObject* kw_place = NULL; + PyObject* kw_name = NULL; + PyObject* kw_dims = NULL; + PyObject* kw_dtype = NULL; + PyObject* kw_type = NULL; + + // the keywords argument + static char* kwlist[] = { + const_cast("value"), const_cast("place"), + const_cast("persistable"), const_cast("zero_copy"), + const_cast("name"), const_cast("stop_gradient"), + const_cast("dims"), const_cast("dtype"), + const_cast("type"), NULL}; + + // 'O' Store a Python object (without any conversion) in a C object pointer, + // '|' Indicates that the remaining arguments in the Python argument list are + // optional. + // PyArg_ParseTupleAndKeywords can Parse the parameters of a function that + // takes both positional and keyword parameters into local variables, + // which enhance case2, case3, case4, case5, case6, case7. + bool flag_ = PyArg_ParseTupleAndKeywords( + args, kwargs, "|OOOOOOOOO", kwlist, &kw_value, &kw_place, &kw_persistable, + &kw_zero_copy, &kw_name, &kw_stop_gradient, &kw_dims, &kw_dtype, + &kw_type); + + // helper map + std::unordered_map kws_map{ + {"value", kw_value}, + {"place", kw_place}, + {"persistable", kw_persistable}, + {"zero_copy", kw_zero_copy}, + {"name", kw_name}, + {"stop_gradient", kw_stop_gradient}, + {"dims", kw_dims}, + {"dtype", kw_dtype}, + {"type", kw_type}}; + + PADDLE_ENFORCE_EQ(flag_, true, + paddle::platform::errors::PreconditionNotMet( + "Could not parse args and kwargs successfully, " + "please check your input first and make" + "sure you are on the right way. 
" + "The expected arguments as follow: (" + "value, place, persistable, zero_copy, " + "name, stop_gradient, dims, dtype, type)")); + PADDLE_ENFORCE_NOT_NULL( self, paddle::platform::errors::Fatal( "Calling __init__ of Eager Tensor without __new__ is " @@ -184,10 +498,12 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwds) { auto py_tensor_ptr = reinterpret_cast(self); - // TODO(jiabin): Only support case 2 for now Py_ssize_t args_num = PyTuple_Size(args); - switch (args_num) { - case (Py_ssize_t)0: { + VLOG(6) << " args_num: " << args_num; + + // args_num = 0, means that there is no position arguments. + if (args_num == (Py_ssize_t)0) { + if (!flag_kwargs) { // case 1 VLOG(6) << "Calling case1's initializer."; EmptyEagerTensorInitializer( @@ -195,154 +511,150 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwds) { egr::Controller::Instance().GenerateUniqueName("generated_tensor"), egr::Controller::Instance().GetExpectedPlace()); return 0; - } - case (Py_ssize_t)1: { - // case 4, 5 - PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); - if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { - VLOG(6) << "Calling case4's initializer."; - PADDLE_ENFORCE_EQ( - pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr), true, - paddle::platform::errors::Fatal( - "We expected initial parametes list like: \n **value: ndarray. " - "But got value with wrong type: %s", - reinterpret_cast(arg0_ptr->ob_type)->tp_name)); - py::object numpy_value = py::object(py::handle(arg0_ptr), true); - EmptyEagerTensorInitializer( - py_tensor_ptr, - egr::Controller::Instance().GenerateUniqueName("generated_tensor"), - egr::Controller::Instance().GetExpectedPlace()); - InitEagerTensorWithNumpyValue(py_tensor_ptr, numpy_value, - /** zero copy **/ false); - return 0; - } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( - p_eager_tensor_type))) { - VLOG(6) << "Calling case5's initializer."; - auto src_tensor = CastPyArg2EagerTensor(arg0_ptr, 0); - InitEagerTensorWithEagerTensor( - py_tensor_ptr, src_tensor, - egr::Controller::Instance().GetExpectedPlace(), - egr::Controller::Instance().GenerateUniqueName("generated_tensor")); - return 0; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "We only support construct tensor from numpy value or tensor with " - "python args by this initializer, " - "please check your input first and make sure you are on the right " - "way.")); - } - return 0; - } - case (Py_ssize_t)2: { - PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); - if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { - VLOG(6) << "Calling case3's initializer."; - PADDLE_ENFORCE_EQ( - pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr), true, - paddle::platform::errors::Fatal( - "We expected initial parametes list like: \n **value: ndarray. 
" - "But got value with wrong type: %s", - reinterpret_cast(arg0_ptr->ob_type)->tp_name)); - py::object numpy_value = py::object(py::handle(arg0_ptr), true); - paddle::platform::Place place = - CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); - EmptyEagerTensorInitializer( - py_tensor_ptr, - egr::Controller::Instance().GenerateUniqueName("generated_tensor"), - place); - InitEagerTensorWithNumpyValue(py_tensor_ptr, numpy_value, - /** zero copy **/ false); - return 0; - } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( - p_eager_tensor_type))) { - VLOG(6) << "Calling case6's initializer."; - auto src_tensor = CastPyArg2EagerTensor(arg0_ptr, 0); - paddle::platform::Place place = - CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); - InitEagerTensorWithEagerTensor( - py_tensor_ptr, src_tensor, place, - egr::Controller::Instance().GenerateUniqueName("generated_tensor")); - return 0; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "We only support construct tensor from numpy value or tensor with " - "python args by this initializer, " - "please check your input first and make sure you are on the right " - "way.")); - } - } - case (Py_ssize_t)3: { - PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); - if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { - VLOG(6) << "Calling case3's initializer."; - PADDLE_ENFORCE_EQ( - pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr), true, - paddle::platform::errors::Fatal( - "We expected initial parametes list like: \n **value: ndarray. " - "But got value with wrong type: %s", - reinterpret_cast(arg0_ptr->ob_type)->tp_name)); - py::object numpy_value = py::object(py::handle(arg0_ptr), true); - paddle::platform::Place place = - CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); - bool persistable = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2); - EmptyEagerTensorInitializer( - py_tensor_ptr, - egr::Controller::Instance().GenerateUniqueName("generated_tensor"), - place, persistable); - InitEagerTensorWithNumpyValue(py_tensor_ptr, numpy_value, - /** zero copy **/ false); - return 0; - } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( - p_eager_tensor_type))) { - VLOG(6) << "Calling case6's initializer."; - auto src_tensor = CastPyArg2EagerTensor(arg0_ptr, 0); - paddle::platform::Place place = - CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); + } else { // no position args, all arguments are kwargs + if (kw_value != NULL) { + if (pybind11::detail::npy_api::get().PyArray_Check_(kw_value)) { + VLOG(6) << "Calling case3's or case4's initializer"; + AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, + flag_kwargs, args_num); + return 0; + } else if (PyObject_IsInstance(kw_value, reinterpret_cast( + p_eager_tensor_type))) { + VLOG(6) << "Calling case5's or case6's initializer"; + AutoInitEagerTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); + return 0; + } else if (PyObject_IsInstance(kw_value, + reinterpret_cast( + g_framework_tensor_pytype))) { + VLOG(6) << "Calling case7's initializer."; + AutoInitEagerTensorByTensor( + py_tensor_ptr, kws_map, args, flag_kwargs, args_num, + /* false means not init by egr tensor*/ false); + return 0; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Could not parse the first keyword argument successfully, " + "the first keyword argument is value, but it should be PyArray " + "or EagerTensor or framework::Tensor. 
" + "Please check your input first and make sure you are on the " + "right way.")); + } + } else if (kw_dtype != NULL && + PyObject_IsInstance( + kw_dtype, reinterpret_cast(g_vartype_pytype))) { + VLOG(6) << "Calling case2's initializer"; + + PADDLE_ENFORCE_NOT_NULL( + kw_dims, + paddle::platform::errors::InvalidArgument( + "Calling __init__ of Eager Tensor with NULL dims is " + "forbidden. Please check your code and make sure you new a " + "dims before calling this constructor.")); + + PADDLE_ENFORCE_NOT_NULL( + kw_name, + paddle::platform::errors::InvalidArgument( + "Calling __init__ of Eager Tensor with NULL name is " + "forbidden. Please check your code and make sure you new a " + "name before calling this constructor.")); + + PADDLE_ENFORCE_NOT_NULL( + kw_dtype, + paddle::platform::errors::InvalidArgument( + "Calling __init__ of Eager Tensor with NULL dtype is " + "forbidden. Please check your code and make sure you new a " + "dtype before calling this constructor.")); + + PADDLE_ENFORCE_NOT_NULL( + kw_persistable, + paddle::platform::errors::InvalidArgument( + "Calling __init__ of Eager Tensor with NULL persistable is " + "forbidden. Please check your code and make sure you new a " + "persistable before calling this constructor.")); + + paddle::framework::proto::VarType::Type dtype = + CastPyArg2ProtoType(kw_dtype, 0); + std::vector dims = CastPyArg2VectorOfInt(kw_dims, 0); + std::string act_name = ""; - PyObject* name_obj = PyTuple_GET_ITEM(args, 2); - if (name_obj == Py_None) { + if (kw_name == Py_None) { act_name = egr::Controller::Instance().GenerateUniqueName( "generated_tensor"); } else { - act_name = CastPyArg2AttrString(name_obj, 2); + act_name = CastPyArg2AttrString(kw_name, 0); } - InitEagerTensorWithEagerTensor(py_tensor_ptr, src_tensor, place, - act_name); + + paddle::framework::proto::VarType::Type var_type = + CastPyArg2ProtoType(kw_type, 0); + bool persistable = CastPyArg2AttrBoolean(kw_persistable, 0); + + EmptyEagerTensorInitializer( + py_tensor_ptr, act_name, + egr::Controller::Instance().GetExpectedPlace(), persistable, + /* stop_gradient */ true, dtype, dims, var_type); + return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( - "We only support construct tensor from numpy value or tensor with " - "python args by this initializer, " - "please check your input first and make sure you are on the right " - "way.")); + "We not only support construct EagerTensor from numpy value " + "or tensor(EagerTensor or framework::Tensor) " + "with python kwargs by this initializer, " + "but also even support dtype to init a empty EagerTensor. " + "Please check your input first and make sure you call the existed " + "constructor.")); } } - case (Py_ssize_t)4: { - VLOG(6) << "Calling case3's initializer."; - PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); - PADDLE_ENFORCE_EQ( - pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr), true, - paddle::platform::errors::Fatal( - "We expected initial parametes list like: \n **value: ndarray, " - "\n ** place: paddle::platform::Place, \n ** persistable: bool, " - "\n ** zero_copy: bool, \n ** name: std::string, \n ** " - "stop_gradient: bool. 
But got value with wrong type: %s", - reinterpret_cast(arg0_ptr->ob_type)->tp_name)); - py::object numpy_value = - py::object(py::handle(PyTuple_GET_ITEM(args, 0)), true); - paddle::platform::Place place = - CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); - bool persistable = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2); - bool zero_copy = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - EmptyEagerTensorInitializer( - py_tensor_ptr, - egr::Controller::Instance().GenerateUniqueName("generated_tensor"), - place, persistable); - InitEagerTensorWithNumpyValue(py_tensor_ptr, numpy_value, zero_copy); + } else if (args_num == (Py_ssize_t)1 || args_num == (Py_ssize_t)2 || + args_num == (Py_ssize_t)3) { + // 1 to 3 position args, remainting arguments are kwargs + PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); + if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { + VLOG(6) << "Calling case3's or case4's initializer."; + AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); + return 0; + } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( + p_eager_tensor_type))) { + VLOG(6) << "Calling case5's or case6's initializer."; + AutoInitEagerTensorByTensor(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); + return 0; + } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( + g_framework_tensor_pytype))) { + VLOG(6) << "Calling case7's initializer."; + AutoInitEagerTensorByTensor( + py_tensor_ptr, kws_map, args, flag_kwargs, args_num, + /* false means not init by egr tensor*/ false); + return 0; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "We support construct EagerTensor from numpy value " + "or tensor(EagerTensor or framework::Tensor) " + "with python args and kwargs by this initializer, " + "but the first argument should be PyArray or EagerTensor or " + "framework::Tensor. " + "Please check your input first and make sure you call the existed " + "constructor.")); + } + } else if (args_num == (Py_ssize_t)4) { + // 4 position args, remainting arguments are kwargs + PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); + if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { + VLOG(6) << "Calling case3's or case4's initializer."; + AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Incompatible constructor arguments, " + "there are 4 position args and remainting arguments arg kwargs," + "but the first position args should be PyArray. 
" + "Please check your code and make sure the first position args is " + "PyArray.")); } - case (Py_ssize_t)5: { - // case 2 + } else if (args_num == (Py_ssize_t)5) { + if (!flag_kwargs) { PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); if (PyObject_IsInstance(arg0_ptr, reinterpret_cast(g_vartype_pytype))) { @@ -350,7 +662,6 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwds) { paddle::framework::proto::VarType::Type dtype = CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0); std::vector dims = - CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 1), 1); std::string act_name = ""; PyObject* name_obj = PyTuple_GET_ITEM(args, 2); @@ -368,82 +679,57 @@ int EagerTensorInit(PyObject* self, PyObject* args, PyObject* kwds) { egr::Controller::Instance().GetExpectedPlace(), persistable, true, dtype, dims, var_type); return 0; - } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( - p_eager_tensor_type))) { - PADDLE_ENFORCE_EQ( - pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr), true, - paddle::platform::errors::Fatal( - "We expected initial parametes list like: \n **value: ndarray, " - "\n ** place: paddle::platform::Place, \n ** persistable: " - "bool, \n ** zero_copy: bool, \n ** name: std::string, \n ** " - "stop_gradient: bool. But got value with wrong type: %s", - reinterpret_cast(arg0_ptr->ob_type)->tp_name)); - py::object numpy_value = - py::object(py::handle(PyTuple_GET_ITEM(args, 0)), true); - paddle::platform::Place place = - CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); - bool persistable = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2); - bool zero_copy = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - std::string act_name = ""; - PyObject* name_obj = PyTuple_GET_ITEM(args, 4); - if (name_obj == Py_None) { - act_name = egr::Controller::Instance().GenerateUniqueName( - "generated_tensor"); - } else { - act_name = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 4), 4); - } - EmptyEagerTensorInitializer(py_tensor_ptr, act_name, place, - persistable); - InitEagerTensorWithNumpyValue(py_tensor_ptr, numpy_value, zero_copy); + } else if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { + VLOG(6) << "Calling case3's initializer."; + AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); return 0; } else { PADDLE_THROW(platform::errors::InvalidArgument( - "We only support construct tensor from numpy value or dtype with " - "python args by this initializer, " - "please check your input first and make sure you are on the right " - "way.")); + "Incompatible constructor arguments, " + "there are only 5 position args," + "but the first position args should be PyArray or dtype. " + "Please check your code and make sure you call the existed " + "constructor.")); } - return 0; - } - case (Py_ssize_t)6: { - // case 3 - VLOG(6) << "Calling case3's initializer."; + } else { // five position args, remainting arguments are kwargs PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); - PADDLE_ENFORCE_EQ( - pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr), true, - paddle::platform::errors::Fatal( - "We expected initial parametes list like: \n **value: ndarray, " - "\n ** place: paddle::platform::Place, \n ** persistable: bool, " - "\n ** zero_copy: bool, \n ** name: std::string, \n ** " - "stop_gradient: bool. 
But got value with wrong type: %s", - reinterpret_cast(arg0_ptr->ob_type)->tp_name)); - py::object numpy_value = - py::object(py::handle(PyTuple_GET_ITEM(args, 0)), true); - paddle::platform::Place place = - CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); - bool persistable = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2); - bool zero_copy = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - std::string act_name = ""; - PyObject* name_obj = PyTuple_GET_ITEM(args, 4); - if (name_obj == Py_None) { - act_name = - egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { + VLOG(6) << "Calling case3's or case4's initializer"; + AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); + return 0; } else { - act_name = CastPyArg2AttrString(name_obj, 4); + PADDLE_THROW(platform::errors::InvalidArgument( + "Incompatible constructor arguments, " + "there are 5 position args and remainting arguments are kwargs," + "but the first position args should be PyArray. " + "Please check your code and make sure the first position args is " + "PyArray.")); } - bool stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5); - EmptyEagerTensorInitializer(py_tensor_ptr, act_name, place, persistable, - stop_gradient); - InitEagerTensorWithNumpyValue(py_tensor_ptr, numpy_value, zero_copy); - return 0; } - default: { - PADDLE_THROW(platform::errors::Fatal( - "Can't not find expected num of args, please check your call, and " - "make sure u call the existed constructor.")); - return 1; + } else if (args_num == (Py_ssize_t)6) { + if (!flag_kwargs) { + // case 3 + VLOG(6) << "Calling case3's initializer."; + AutoInitEagerTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); + return 0; + } else { // six position args, remainting arguments are kwargs, but this + // is not a right way + PADDLE_THROW(platform::errors::InvalidArgument( + "Incompatible constructor arguments, " + "there are 6 position args and the remainting arguments are kwargs. 
" + "Please check your code and make sure the first position args is " + "PyArray.")); } + } else { + PADDLE_THROW(platform::errors::Fatal( + "Can't not find expected num of args, please check your call, and " + "make sure u call the existed constructor.")); } + + return 1; } static void eagertensor_dealloc(EagerTensorObject* self) { @@ -455,6 +741,10 @@ extern struct PyGetSetDef variable_properties[]; extern PyMethodDef variable_methods[]; +PyNumberMethods number_methods; +PySequenceMethods sequence_methods; +PyMappingMethods mapping_methods; + PyTypeObject eager_tensor_type = { PyVarObject_HEAD_INIT(NULL, 0) "core_avx.eager.EagerTensor", /* tp_name */ sizeof(EagerTensorObject), /* tp_basicsize */ @@ -465,9 +755,9 @@ PyTypeObject eager_tensor_type = { 0, /* tp_setattr */ 0, /* tp_reserved */ 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ + &number_methods, /* tp_as_number */ + &sequence_methods, /* tp_as_sequence */ + &mapping_methods, /* tp_as_mapping */ 0, /* tp_hash */ 0, /* tp_call */ 0, /* tp_str */ @@ -512,7 +802,7 @@ void BindEager(pybind11::module* module) { p_eager_tensor_type = &eager_tensor_type; if (PyType_Ready(&eager_tensor_type) < 0) { PADDLE_THROW(platform::errors::Fatal( - "Init Paddle erroe in BindEager(PyType_Ready).")); + "Init Paddle error in BindEager(PyType_Ready).")); return; } @@ -522,7 +812,7 @@ void BindEager(pybind11::module* module) { Py_DECREF(&eager_tensor_type); Py_DECREF(m.ptr()); PADDLE_THROW(platform::errors::Fatal( - "Init Paddle erroe in BindEager(PyModule_AddObject).")); + "Init Paddle error in BindEager(PyModule_AddObject).")); return; } diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index b980692d45558..3f8923440be50 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -112,84 +112,6 @@ static PyObject* eager_api_scale(PyObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* eager_api_numpy_to_tensor(PyObject* numpy_data, - pten::DataType dtype, - const paddle::platform::Place& place, - bool stop_gradient) { - std::vector vec_dims; - auto numpy_shape = pybind11::detail::array_proxy(numpy_data)->dimensions; - int rank = pybind11::detail::array_proxy(numpy_data)->nd; - for (int i = 0; i < rank; i++) { - vec_dims.push_back(static_cast(numpy_shape[i])); - } - paddle::framework::DDim dims = paddle::framework::make_ddim(vec_dims); - - // TODO(jiabin): Support GPU later - auto meta = pten::DenseTensorMeta(dtype, dims); - auto holder = std::make_shared(numpy_data, dtype); - auto shared_storage = - pten::make_intrusive(holder, 0); - std::shared_ptr densetensor( - new pten::DenseTensor(std::move(shared_storage), std::move(meta))); - - PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); - if (obj) { - auto v = reinterpret_cast(obj); - new (&(v->eager_tensor)) egr::EagerTensor(); - v->eager_tensor.set_impl(densetensor); - v->eager_tensor.set_name(egr::Controller::Instance().GenerateUniqueName()); - auto meta = egr::EagerUtils::autograd_meta(&(v->eager_tensor)); - meta->SetStopGradient(stop_gradient); - - // Created tensor will be leaf tensor - // So we append AccumulationNode to it. - auto accumulation_node = std::make_shared(); - meta->SetGradNode(accumulation_node); - - // TODO(jiabin): Shall we increase ref cnt here to make python ref cnt num - // correctly? 
- } else { - PADDLE_THROW(platform::errors::Fatal( - "tp_alloc return null, can not new a PyObject.")); - } - - return obj; -} - -static PyObject* eager_api_to_tensor(PyObject* self, PyObject* args, - PyObject* kwargs) { - EAGER_TRY - // TODO(jiabin): Support Kwargs here - PyObject* data = PyTuple_GET_ITEM(args, 0); - auto str_dtype = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); - pten::DataType dtype = pten::String2DataType(str_dtype); - auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); - bool stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - // TODO(jiabin): Support this when python given name - // auto str_name = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 4), 4); - - if (pybind11::detail::npy_api::get().PyArray_Check_(data)) { - return eager_api_numpy_to_tensor(data, dtype, place, stop_gradient); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Eater to_tensor only support numpy to tensor.")); - Py_INCREF(Py_None); - return Py_None; - } - EAGER_CATCH_AND_THROW_RETURN_NULL -} - -static PyObject* eager_api_retain_grad_for_tensor(PyObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_TRY - egr::egr_utils_api::RetainGradForTensor( - CastPyArg2EagerTensor(PyTuple_GET_ITEM(args, 0), 0)); - Py_INCREF(Py_None); - return Py_None; - EAGER_CATCH_AND_THROW_RETURN_NULL -} - static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -203,9 +125,29 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + egr::EagerTensor& src = + reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eager_tensor; + egr::EagerTensor& dst = + reinterpret_cast(PyTuple_GET_ITEM(args, 1)) + ->eager_tensor; + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); + bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + + dst = src.copy_to(pten::TransToPtenBackend(place), blocking); + egr::EagerUtils::autograd_meta(&dst)->SetStopGradient( + egr::EagerUtils::autograd_meta(&(src))->StopGradient()); + egr::EagerUtils::autograd_meta(&dst)->SetPersistable( + egr::EagerUtils::autograd_meta(&(src))->Persistable()); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_functions[] = { - {"to_tensor", (PyCFunction)(void (*)(void))eager_api_to_tensor, - METH_VARARGS | METH_KEYWORDS, NULL}, {"scale", (PyCFunction)(void (*)(void))eager_api_scale, METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_expected_place", @@ -214,11 +156,10 @@ PyMethodDef variable_functions[] = { {"_get_expected_place", (PyCFunction)(void (*)(void))eager_api_get_expected_place, METH_VARARGS | METH_KEYWORDS, NULL}, - {"retain_grad_for_tensor", - (PyCFunction)(void (*)(void))eager_api_retain_grad_for_tensor, - METH_VARARGS | METH_KEYWORDS, NULL}, {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, METH_VARARGS | METH_KEYWORDS, NULL}, + {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; void BindFunctions(PyObject* module) { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 790969a4b60a6..7f131f9ccd742 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -17,6 +17,7 @@ limitations under the License. 
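The variable_functions edits above follow the standard CPython registration pattern: each entry pairs a Python-visible name with a C function taking (self, args, kwargs), flagged METH_VARARGS | METH_KEYWORDS, and a NULL sentinel closes the table. A generic sketch of that pattern, not Paddle's actual declarations:

    #include <Python.h>

    // A do-nothing binding with the signature required by METH_VARARGS | METH_KEYWORDS.
    static PyObject* demo_func(PyObject* self, PyObject* args, PyObject* kwargs) {
      Py_INCREF(Py_None);
      return Py_None;
    }

    static PyMethodDef demo_functions[] = {
        {"demo_func", (PyCFunction)(void (*)(void))demo_func,
         METH_VARARGS | METH_KEYWORDS, NULL},
        {NULL, NULL, 0, NULL}};  // sentinel entry terminates the table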
*/ #include "pybind11/numpy.h" #include "pybind11/pybind11.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/utils.h" @@ -26,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" @@ -38,10 +40,12 @@ extern PyTypeObject* pEagerTensorType; static PyObject* eager_tensor_method_numpy(EagerTensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_SYNC_TRY - if (!self->eager_tensor.initialized()) { - Py_INCREF(Py_None); - return Py_None; - } + PADDLE_ENFORCE_EQ( + self->eager_tensor.initialized(), true, + platform::errors::InvalidArgument( + "Tensor data of %s is Empty that indicates we have null tensor for " + "now, please check if it has no data and initialize it first.", + self->eager_tensor.name())); auto tensor_dims = self->eager_tensor.shape(); auto numpy_dtype = TensorDtype2NumpyDtype(self->eager_tensor.type()); auto sizeof_dtype = pten::DataTypeSize(self->eager_tensor.type()); @@ -120,13 +124,106 @@ static PyObject* eager_tensor_method_copy_(EagerTensorObject* self, egr::EagerTensor src_tensor = CastPyArg2EagerTensor(PyTuple_GET_ITEM(args, 0), 0); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); + VLOG(6) << "Start Copy Tensor " << src_tensor.name() << " to " + << self->eager_tensor.name(); + if (!self->eager_tensor.defined()) { + egr::EagerUtils::autograd_meta(&(self->eager_tensor)) + ->SetStopGradient( + egr::EagerUtils::autograd_meta(&(src_tensor))->StopGradient()); + egr::EagerUtils::autograd_meta(&(self->eager_tensor)) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); + } + self->eager_tensor.copy_(src_tensor, blocking); - egr::EagerUtils::autograd_meta(&(self->eager_tensor)) - ->SetStopGradient( - egr::EagerUtils::autograd_meta(&(src_tensor))->StopGradient()); - egr::EagerUtils::autograd_meta(&(self->eager_tensor)) - ->SetPersistable( - egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); + + VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to " + << self->eager_tensor.name(); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_tensor_retain_grads(EagerTensorObject* self, + PyObject* args, PyObject* kwargs) { + EAGER_TRY + if (egr::Controller::Instance().HasGrad()) { + auto meta = egr::EagerUtils::autograd_meta(&(self->eager_tensor)); + if (!meta->GetMutableGradNode()) { + VLOG(6) << "Make grad node of tensor: " << self->eager_tensor.name() + << "become accumulation node"; + meta->SetGradNode(std::make_shared()); + } + egr::egr_utils_api::RetainGradForTensor(self->eager_tensor); + } + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_tensor__clear_gradient(EagerTensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_SYNC_TRY + VLOG(4) << "ClearGradient " << self->eager_tensor.name(); + + egr::EagerTensor grad; + if (egr::egr_utils_api::IsLeafTensor(self->eager_tensor)) { + // Add RetainGrad as PostHook to AccumulationNode + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->eager_tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + 
paddle::platform::errors::Fatal("Detected NULL grad_node" + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + grad = accumulation_grad_node->Grad(); + } else { + auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); + grad = meta->Grad(); + } + + if (grad.initialized()) { + VLOG(4) << "Gradient of " << self->eager_tensor.name() + << " is initialized, will be released."; + auto dense_tensor = + std::dynamic_pointer_cast(grad.impl()); + dense_tensor->release(); + } + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_tensor__zero_grads(EagerTensorObject* self, + PyObject* args, PyObject* kwargs) { + EAGER_TRY + VLOG(4) << "ZeroGrads " << self->eager_tensor.name(); + + egr::EagerTensor grad; + if (egr::egr_utils_api::IsLeafTensor(self->eager_tensor)) { + // Add RetainGrad as PostHook to AccumulationNode + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->eager_tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node" + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + grad = accumulation_grad_node->Grad(); + } else { + auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); + grad = meta->Grad(); + } + + if (grad.initialized()) { + grad.set_tensor(std::make_shared( + paddle::experimental::zeros_like(*(grad.Tensor().get())))); + } Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL @@ -142,6 +239,13 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"copy_", (PyCFunction)(void (*)(void))eager_tensor_method_copy_, METH_VARARGS | METH_KEYWORDS, NULL}, + {"retain_grads", (PyCFunction)(void (*)(void))eager_tensor_retain_grads, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_clear_gradient", + (PyCFunction)(void (*)(void))eager_tensor__clear_gradient, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_zero_grads", (PyCFunction)(void (*)(void))eager_tensor__zero_grads, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 4e880f78c6d19..3d0a4d0de75bd 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -70,11 +70,17 @@ const char* OUT_VAR_TYPE = R"(std::shared_ptr)"; const char* OUT_VAR_LIST_TYPE = R"(std::vector>)"; const char* CAST_VAR_TEMPLATE = R"( - auto %s = GetEagerTensorFromArgs("%s", "%s", args, %d, %s);)"; + auto& %s = GetEagerTensorFromArgs("%s", "%s", args, %d, %s);)"; const char* CAST_VAR_LIST_TEMPLATE = R"( auto %s = GetEagerTensorListFromArgs("%s", "%s", args, %d, %s);)"; +const char* CAST_VAR_PTR_TEMPLATE = R"( + auto %s = GetEagerTensorPtrFromArgs("%s", "%s", args, %d, %s);)"; + +const char* CAST_VAR_PTR_LIST_TEMPLATE = R"( + auto %s = GetEagerTensorPtrListFromArgs("%s", "%s", args, %d, %s);)"; + const char* CAST_SIZE_T_TEMPLATE = R"( auto %s = GetUnsignedLongFromArgs("%s", "%s", args, %d, %s);)"; @@ -221,8 +227,8 @@ std::string GenerateOpFunctionsBody( outs_initializer += ","; } - const auto in_cast_type = - output.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; + const auto in_cast_type = output.duplicable() ? 
CAST_VAR_PTR_LIST_TEMPLATE + : CAST_VAR_PTR_TEMPLATE; auto dispensable = output.dispensable() ? "true" : "false"; ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type, out_name, arg_idx++, dispensable); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index b8b7adea50641..b147d5fbad0ed 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/memory/allocation/allocator.h" @@ -60,8 +62,23 @@ PyObject* eager_tensor_properties_get_stop_gradient(EagerTensorObject* self, PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self, void* closure) { EAGER_SYNC_TRY - auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); - return ToPyObject(meta->Grad()); + if (egr::egr_utils_api::IsLeafTensor(self->eager_tensor)) { + // Add RetainGrad as PostHook to AccumulationNode + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->eager_tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node" + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + return ToPyObject(accumulation_grad_node->Grad()); + } else { + VLOG(6) << "Get grad for tensor: " << self->eager_tensor.name(); + auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor); + return ToPyObject(meta->Grad()); + } EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index d9da5102262fe..9849d0d41611b 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -40,6 +40,7 @@ extern PyTypeObject* g_cpuplace_pytype; extern PyTypeObject* g_xpuplace_pytype; extern PyTypeObject* g_npuplace_pytype; extern PyTypeObject* g_cudapinnedplace_pytype; +extern PyTypeObject* g_framework_tensor_pytype; int TensorDtype2NumpyDtype(pten::DataType dtype) { switch (dtype) { @@ -199,7 +200,7 @@ std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "list of bool, but got %s at pos %d", + "list of Tensor, but got %s at pos %d", arg_pos + 1, reinterpret_cast(item->ob_type)->tp_name, i)); } @@ -216,11 +217,13 @@ std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "list of EagerTensor, but got %s at pos %d", + "list of Tensor, but got %s at pos %d", arg_pos + 1, reinterpret_cast(item->ob_type)->tp_name, i)); } } + } else if (obj == Py_None) { + return {}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -262,6 +265,8 @@ std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos) { reinterpret_cast(item->ob_type)->tp_name, i)); } } + } else if (obj == Py_None) { + return {}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -300,6 +305,18 @@ platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) { return place; } +framework::Tensor 
CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) { + if (PyObject_IsInstance( + obj, reinterpret_cast(g_framework_tensor_pytype))) { + return ::pybind11::handle(obj).cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "EagerTensor, but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } +} + paddle::framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos) { paddle::framework::proto::VarType::Type dtype; @@ -478,10 +495,10 @@ PyObject* ToPyObject( return dict; } -egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type, - const std::string& arg_name, - PyObject* args, ssize_t arg_idx, - bool dispensable) { +egr::EagerTensor& GetEagerTensorFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable) { PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); if (PyTuple_Check(obj)) { @@ -494,7 +511,7 @@ egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type, "%s(): argument '%s' (position %d) must be Tensor, but got None", op_type, arg_name, arg_idx)); } - egr::EagerTensor emptytensor; + static egr::EagerTensor emptytensor; return emptytensor; } @@ -544,6 +561,8 @@ std::vector GetEagerTensorListFromArgs( reinterpret_cast(PyTuple_GetItem(list, i)) ->eager_tensor); } + } else if (list == Py_None) { + return {}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument '%s' (position %d) must be list of Tensors, but got " @@ -555,5 +574,83 @@ std::vector GetEagerTensorListFromArgs( return result; } +egr::EagerTensor* GetEagerTensorPtrFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable) { + PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + + if (PyTuple_Check(obj)) { + obj = PyTuple_GET_ITEM(obj, 0); + } + + if (obj == nullptr || obj == Py_None) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got None", + op_type, arg_name, arg_idx)); + } + static egr::EagerTensor emptytensor; + return &emptytensor; + } + + return &(reinterpret_cast(obj)->eager_tensor); +} + +std::vector GetEagerTensorPtrListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* list = PyTuple_GET_ITEM(args, arg_idx); + + if (list == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensor, but got " + "None", + op_type, arg_name, arg_idx)); + } + return {}; + } + + std::vector result; + + if (PyList_Check(list)) { + Py_ssize_t len = PyList_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + for (Py_ssize_t i = 0; i < len; i++) { + result.emplace_back( + &(reinterpret_cast(PyList_GetItem(list, i)) + ->eager_tensor)); + } + } else if (PyTuple_Check(list)) { + Py_ssize_t len = PyTuple_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + for (Py_ssize_t i = 0; i < len; i++) { + result.emplace_back( + &(reinterpret_cast(PyTuple_GetItem(list, i)) + ->eager_tensor)); + } + } else if (list == Py_None) { + return {}; + } 
else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "%s", + op_type, arg_name, arg_idx, + (reinterpret_cast(list->ob_type))->tp_name)); + } + + return result; +} } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index e493e06d7d7df..20c82c572c325 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -35,6 +35,7 @@ egr::EagerTensor CastPyArg2EagerTensor(PyObject* obj, ssize_t arg_pos); std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, ssize_t arg_pos); platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); +framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos); std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); @@ -83,13 +84,21 @@ PyObject* ToPyObject(const std::tuple& out) { return result; } -egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type, - const std::string& arg_name, - PyObject* args, ssize_t arg_idx, - bool dispensable = false); +egr::EagerTensor& GetEagerTensorFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable = false); std::vector GetEagerTensorListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); +egr::EagerTensor* GetEagerTensorPtrFromArgs(const std::string& op_type, + const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable = false); +std::vector GetEagerTensorPtrListFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 1d73ecbab5e54..4f22e83ac626f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -763,7 +763,6 @@ void BindImperative(py::module *m_ptr) { []() { imperative::SetLoadProcessSignalHandler(); }); m.def("_throw_error_if_process_failed", []() { imperative::ThrowErrorIfLoadProcessFailed(); }); - // Dygraph DataLoader reader process & thread related functions m.def( "_convert_to_tensor_list", @@ -793,7 +792,7 @@ void BindImperative(py::module *m_ptr) { SetTensorFromPyArray(&t, array, platform::CPUPlace(), true); // 3. allocate shared memory - void *data_ptr = t.data(); + void *data_ptr = t.data(); size_t data_size = t.numel() * framework::SizeOfType(t.type()); auto shared_writer_holder = memory::allocation::AllocateMemoryMapWriterAllocation(data_size); @@ -828,7 +827,7 @@ void BindImperative(py::module *m_ptr) { SetTensorFromPyArray(&t, array, platform::CPUPlace(), true); // 3. 
allocate shared memory - void *data_ptr = t.data(); + void *data_ptr = t.data(); size_t data_size = t.numel() * framework::SizeOfType(t.type()); auto shared_writer_holder = memory::allocation::AllocateMemoryMapWriterAllocation(data_size); @@ -866,7 +865,10 @@ void BindImperative(py::module *m_ptr) { m.def("start_imperative_gperf_profiler", []() { imperative::StartProfile(); }); - + m.def("_set_eager_tracer", + [](const std::shared_ptr &tracer) { + egr::Controller::Instance().SetCurrentTracer(tracer); + }); m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); }); m.def("_is_dygraph_debug_enabled", @@ -876,9 +878,8 @@ void BindImperative(py::module *m_ptr) { [](const std::shared_ptr &tracer) { if (egr::Controller::Instance().InEagerMode()) { egr::Controller::Instance().SetCurrentTracer(tracer); - } else { - imperative::SetCurrentTracer(tracer); } + imperative::SetCurrentTracer(tracer); }); m.def("_enable_eager_mode", []() { egr::Controller::Instance().SetInEagerMode(true); }); @@ -1856,7 +1857,7 @@ void BindImperative(py::module *m_ptr) { // 1. get LoDTensor auto *t = self->MutableVar()->GetMutable(); // 2. allocate shared memory - void *data_ptr = t->data(); + void *data_ptr = t->data(); size_t data_size = t->numel() * framework::SizeOfType(t->type()); auto shared_writer_holder = memory::allocation::AllocateMemoryMapWriterAllocation( @@ -1984,6 +1985,7 @@ void BindImperative(py::module *m_ptr) { platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); dst_->ShareBufferWith(*src); + dst_->ShareDataTypeWith(*src); }) .def("_is_shared_buffer_with", [](const std::shared_ptr &self, @@ -2013,6 +2015,29 @@ void BindImperative(py::module *m_ptr) { auto *t = self->MutableVar()->GetMutable(); return t->numel(); }) + .def("element_size", &imperative::VarBase::ElementSize, R"DOC( + Returns the size in bytes of an element in the Tensor. + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor(1, dtype='bool') + x.element_size() # 1 + + x = paddle.to_tensor(1, dtype='float16') + x.element_size() # 2 + + x = paddle.to_tensor(1, dtype='float32') + x.element_size() # 4 + + x = paddle.to_tensor(1, dtype='float64') + x.element_size() # 8 + + x = paddle.to_tensor(1, dtype='complex128') + x.element_size() # 16 + )DOC") .def_property("name", &imperative::VarBase::Name, &imperative::VarBase::SetName) .def_property("stop_gradient", @@ -2020,28 +2045,40 @@ void BindImperative(py::module *m_ptr) { &imperative::VarBase::SetOverridedStopGradient) .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property_readonly( - "shape", - [](imperative::VarBase &self) { - if (self.Var().IsType()) { - return framework::vectorize( - self.Var().Get().dims()); - } else if (self.Var().IsType()) { - return framework::vectorize( - self.Var().Get().value().dims()); - } else if (self.Var().IsType()) { - return std::vector{static_cast( - self.Var().Get().size())}; - } else if (self.Var().IsType()) { - return std::vector{ - static_cast(self.Var().Get().size())}; - } else { - VLOG(2) << "It is meaningless to get shape of " - "variable type " - << GetTypeName(self); - return std::vector(); - } - }) + .def_property_readonly("shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return framework::vectorize( + self.Var() + .Get() + .dims()); + } else if (self.Var() + .IsType< + framework::SelectedRows>()) { + return framework::vectorize( + self.Var() + .Get() + .value() + .dims()); + } else if (self.Var() + .IsType()) { + return std::vector{static_cast( + self.Var() + .Get() + .size())}; + } else if (self.Var() + .IsType()) { + return std::vector{static_cast( + self.Var() + .Get() + .size())}; + } else { + VLOG(2) << "It is meaningless to get shape of " + "variable type " + << GetTypeName(self); + return std::vector(); + } + }) .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. 
@@ -2115,6 +2152,8 @@ void BindImperative(py::module *m_ptr) { if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); + // TODO(jiabin): Support eager here when we need to make all + // dygraph in eager mode VLOG(4) << "Tracer(" << &self << ")" << " set expected place " << *p; } else if (py::isinstance(obj)) { @@ -2189,65 +2228,75 @@ void BindImperative(py::module *m_ptr) { [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, framework::AttributeMap attrs, const platform::XPUPlace &place, - bool trace_backward) { + bool trace_backward, + const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward); + std::move(attrs), place, trace_backward, + inplace_map); } }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, framework::AttributeMap attrs, const platform::CUDAPlace &place, - bool trace_backward) { + bool trace_backward, + const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward); + std::move(attrs), place, trace_backward, + inplace_map); } }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, framework::AttributeMap attrs, const platform::NPUPlace &place, - bool trace_backward) { + bool trace_backward, + const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward); + std::move(attrs), place, trace_backward, + inplace_map); } }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, framework::AttributeMap attrs, const platform::MLUPlace &place, - bool trace_backward) { + bool trace_backward, + const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward); + std::move(attrs), place, trace_backward, + inplace_map); } }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, framework::AttributeMap attrs, const platform::CPUPlace &place, - bool trace_backward) { + bool trace_backward, + const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; self.TraceOp(type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward); + std::move(attrs), place, trace_backward, + inplace_map); } }); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 5193724ecedf5..5b788caeb12d0 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -742,7 +742,13 @@ void BindPaddleInferPredictor(py::module *m) { .def("get_output_names", 
&paddle_infer::Predictor::GetOutputNames) .def("get_input_handle", &paddle_infer::Predictor::GetInputHandle) .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle) - .def("run", &paddle_infer::Predictor::Run) + .def("run", + [](paddle_infer::Predictor &self) { +#ifdef PADDLE_WITH_ASCEND_CL + pybind11::gil_scoped_release release; +#endif + self.Run(); + }) .def("clone", &paddle_infer::Predictor::Clone) .def("try_shrink_memory", &paddle_infer::Predictor::TryShrinkMemory) .def("clear_intermediate_tensor", diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index f2fb4671dffe7..bb45c1c40603f 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -143,6 +143,7 @@ void BindNode(py::module *m) { .def("var", &Node::Var, return_value_policy::reference) .def("op", &Node::Op, return_value_policy::reference) .def("id", &Node::id) + .def("original_desc_id", &Node::OriginalDescId) .def("is_op", &Node::IsOp) .def("is_var", &Node::IsVar) .def("is_ctrl_var", &Node::IsCtrlVar) diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index c29228f2a5df6..f83997843f433 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -71,6 +71,9 @@ std::map> op_ins_map = { {"adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}}, + {"merged_adam", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, {"adamw", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}}, @@ -79,6 +82,7 @@ std::map> op_ins_map = { "Beta2Pow", "MasterParam"}}, {"sparse_attention", {"Q", "K", "V", "Offset", "Columns", "KeyPaddingMask", "AttnMask"}}, + {"sgd", {"Param", "LearningRate", "Grad", "MasterParam"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -122,9 +126,13 @@ std::map> op_outs_map = { {"adam", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, + {"merged_adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, {"adamw", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, + {"sgd", {"ParamOut", "MasterParamOut"}}, {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, @@ -142,10 +150,13 @@ std::map> op_outs_map = { // especially in declarative mode. // For those OPs, we need to manually specify the outs need to pass in this map. 
std::map> op_passing_outs_map = { - {"sgd", {"ParamOut"}}, + {"sgd", {"ParamOut", "MasterParamOut"}}, {"adam", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, + {"merged_adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, {"adamw", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 9e5e391920b08..66bf8c95179af 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -179,6 +179,8 @@ void BindVarDsec(pybind11::module *m) { pybind11::return_value_policy::reference) .def("dtype", &pd::VarDesc::GetDataType, pybind11::return_value_policy::reference) + .def("element_size", &pd::VarDesc::ElementSize, + pybind11::return_value_policy::reference) .def("dtypes", &pd::VarDesc::GetDataTypes, pybind11::return_value_policy::reference) .def("lod_level", &pd::VarDesc::GetLoDLevel) @@ -206,6 +208,8 @@ void BindVarDsec(pybind11::module *m) { .def("_set_attr", &pd::VarDesc::SetAttr) .def("remove_attr", &pd::VarDesc::RemoveAttr) .def("id", &pd::VarDesc::Id) + .def("original_id", &pd::VarDesc::OriginalId) + .def("set_original_id", &pd::VarDesc::SetOriginalId) .def("attr", &pd::VarDesc::GetAttr); pybind11::enum_ vartype(var_desc, "VarType", ""); @@ -303,6 +307,8 @@ void BindOpDesc(pybind11::module *m) { .def("block", [](pd::OpDesc &self) { return self.Block(); }, pybind11::return_value_policy::reference) .def("id", &pd::OpDesc::Id) + .def("original_id", &pd::OpDesc::OriginalId) + .def("set_original_id", &pd::OpDesc::SetOriginalId) .def("inputs", &pd::OpDesc::Inputs) .def("outputs", &pd::OpDesc::Outputs); } diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 6e98a9479fa26..629dc2c4037e7 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -39,6 +39,11 @@ void BindPSGPUWrapper(py::module* m) { .def(py::init([]() { return framework::PSGPUWrapper::GetInstance(); })) .def("set_slot_vector", &framework::PSGPUWrapper::SetSlotVector, py::call_guard()) + .def("set_slot_dim_vector", &framework::PSGPUWrapper::SetSlotDimVector, + py::call_guard()) + .def("set_slot_offset_vector", + &framework::PSGPUWrapper::SetSlotOffsetVector, + py::call_guard()) .def("init_GPU_server", &framework::PSGPUWrapper::InitializeGPUServer, py::call_guard()) .def("set_date", &framework::PSGPUWrapper::SetDate, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f9e327642feb6..925cba86c0139 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -138,6 +138,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + #ifdef PADDLE_WITH_CRYPTO #include "paddle/fluid/pybind/crypto.h" #endif @@ -165,6 +169,8 @@ PyTypeObject *g_cpuplace_pytype = nullptr; PyTypeObject *g_xpuplace_pytype = nullptr; PyTypeObject *g_npuplace_pytype = nullptr; PyTypeObject *g_cudapinnedplace_pytype = nullptr; +PyTypeObject *g_mluplace_pytype = nullptr; +PyTypeObject *g_framework_tensor_pytype = nullptr; bool IsCompiledWithCUDA() { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) @@ -230,6 +236,14 @@ bool IsCompiledWithCINN() { #endif } +bool IsCompiledWithMLU() { +#ifndef PADDLE_WITH_MLU + return false; +#else + return true; +#endif +} + bool IsCompiledWithHETERPS() { #ifndef PADDLE_WITH_HETERPS return false; @@ -295,10 +309,9 @@ OpSupportedInfos(const std::string &place, [](unsigned char c) { return std::toupper(c); }); using fn_type = std::add_pointer::type; std::unordered_map is_target_place{ - {"GPU", &platform::is_gpu_place}, - {"CPU", &platform::is_cpu_place}, - {"XPU", &platform::is_xpu_place}, - {"NPU", &platform::is_npu_place}, + {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place}, + {"XPU", &platform::is_xpu_place}, {"NPU", &platform::is_npu_place}, + {"MLU", &platform::is_mlu_place}, }; PADDLE_ENFORCE_NE( is_target_place.count(query_place), 0, @@ -734,7 +747,11 @@ PYBIND11_MODULE(core_noavx, m) { BindImperative(&m); - py::class_(m, "Tensor", py::buffer_protocol()) + py::class_ framework_tensor(m, "Tensor", + py::buffer_protocol()); + g_framework_tensor_pytype = + reinterpret_cast(framework_tensor.ptr()); + framework_tensor .def("__array__", [](framework::Tensor &self) { return TensorToPyArray(self); }) .def("_is_initialized", @@ -765,6 +782,10 @@ PYBIND11_MODULE(core_noavx, m) { [](framework::Tensor &self, paddle::platform::NPUPlace &place) { self.mutable_data(place); }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_double", [](framework::Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); @@ -781,6 +802,10 @@ PYBIND11_MODULE(core_noavx, m) { [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_int", [](framework::Tensor &self, paddle::platform::CUDAPinnedPlace &place) { @@ -811,6 +836,11 @@ PYBIND11_MODULE(core_noavx, m) { paddle::framework::proto::VarType::Type type) { return reinterpret_cast(self.mutable_data(place, type)); }) + .def("_mutable_data", + [](framework::Tensor &self, paddle::platform::MLUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast(self.mutable_data(place, type)); + }) .def("_clear", &framework::Tensor::clear) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::NPUPlace &place, @@ -827,6 +857,8 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) + .def("_copy_from", &TensorCopyFrom, + py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("set", SetTensorFromPyArray, @@ -839,6 +871,8 @@ PYBIND11_MODULE(core_noavx, m) 
{ py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, R"DOC( @@ -846,7 +880,7 @@ PYBIND11_MODULE(core_noavx, m) { Args: lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the + place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the LoDTensor is to be set. zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False. @@ -1615,6 +1649,18 @@ All parameter, weight, gradient are variables in Paddle. "Please recompile or reinstall Paddle with XPU support.")); #else return new paddle::platform::XPUDeviceContext(place); +#endif + }) + .def_static("create", + [](paddle::platform::MLUPlace& place) + -> paddle::platform::DeviceContext* { +#ifndef PADDLE_WITH_MLU + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot use MLUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with MLU support.")); +#else + return new paddle::platform::MLUDeviceContext(place); #endif }) .def_static("create", @@ -1732,6 +1778,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_get_device_id", @@ -2000,6 +2047,75 @@ All parameter, weight, gradient are variables in Paddle. #endif .def("__str__", string::to_string); + // MLUPlace + py::class_ mluplace(m, "MLUPlace", R"DOC( + MLUPlace is a descriptor of a device. + It represents a MLU device on which a tensor will be allocated and a model will run. + + Examples: + .. code-block:: python + import paddle + # required: mlu + mlu_place = paddle.MLUPlace(0) + + )DOC"); + g_mluplace_pytype = reinterpret_cast(mluplace.ptr()); + mluplace + .def("__init__", + [](platform::MLUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_MLU + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid MLUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) { + if (platform::GetMLUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use MLU because there is no MLU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid MLUPlace(%d), must inside [0, %d), because MLU " + "number on your machine is %d", + dev_id, platform::GetMLUDeviceCount(), + platform::GetMLUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::MLUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use MLU because you have installed CPU/GPU/... 
" + "version " + "PaddlePaddle.\n" + "If you want to use MLU, please try to install MLU version " + "PaddlePaddle by: pip install paddlepaddle-mlu\n" + "If you only have CPU, please change MLUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) +#ifdef PADDLE_WITH_MLU + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::MLUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__str__", string::to_string); + py::class_ platformplace(m, "Place"); g_place_pytype = reinterpret_cast(platformplace.ptr()); platformplace.def(py::init<>()) @@ -2011,6 +2127,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("is_gpu_place", [](platform::Place &self) { return platform::is_gpu_place(self); }) .def("is_cpu_place", @@ -2025,6 +2142,8 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self) { return platform::is_cuda_pinned_place(self); }) + .def("is_mlu_place", + [](platform::Place &self) { return platform::is_mlu_place(self); }) .def("gpu_device_id", [](platform::Place &self) { return BOOST_GET_CONST(platform::CUDAPlace, self).device; @@ -2041,6 +2160,10 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self) { return BOOST_GET_CONST(platform::IPUPlace, self).device; }) + .def("mlu_device_id", + [](platform::Place &self) { + return BOOST_GET_CONST(platform::MLUPlace, self).device; + }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", @@ -2068,6 +2191,10 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self, const platform::IPUPlace &ipu_place) { self = ipu_place; }) + .def("set_place", + [](platform::Place &self, const platform::MLUPlace &mlu_place) { + self = mlu_place; + }) .def("__repr__", string::to_string) .def("__str__", string::to_string); @@ -2116,6 +2243,12 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::MLUPlace &place) { + pybind11::gil_scoped_release release; + self.Run(scope, place); + }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", @@ -2303,6 +2436,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_cinn", IsCompiledWithCINN); + m.def("is_compiled_with_mlu", IsCompiledWithMLU); m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS); m.def("supports_bfloat16", SupportsBfloat16); m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance); @@ -2623,6 +2757,10 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("get_ipu_device_count", platform::GetIPUDeviceCount); #endif +#ifdef PADDLE_WITH_MLU + m.def("get_mlu_device_count", platform::GetMLUDeviceCount); +#endif + py::enum_(m, "TracerOption", py::arithmetic()) .value("kDefault", platform::TracerOption::kDefault) .value("kOpDetail", platform::TracerOption::kOpDetail) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index df9ba02eadf43..b31b7456ebca7 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -345,6 +345,18 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use NPUPlace in CPU/GPU/XPU version. " "Please recompile or reinstall Paddle with NPU support.")); +#endif + } else if (paddle::platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_MLU + platform::Place tmp_place = place; + platform::MLUDeviceGuard guard( + BOOST_GET_CONST(platform::MLUPlace, tmp_place).device); + auto dst = self->mutable_data(place); + paddle::platform::MLUMemcpyH2DSync(dst, array.data(), array.nbytes()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use MLUPlace in CPU/GPU version, " + "Please recompile or reinstall Paddle with MLU support.")); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -702,6 +714,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, bool is_gpu_tensor = platform::is_gpu_place(tensor.place()); bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); bool is_npu_tensor = platform::is_npu_place(tensor.place()); + bool is_mlu_tensor = platform::is_mlu_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = tensor.type(); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -716,11 +729,11 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, numel *= py_dims[i]; } - const void *tensor_buf_ptr = tensor.data(); + const void *tensor_buf_ptr = tensor.data(); std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type()); - if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor && !is_mlu_tensor) { if (!need_deep_copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, @@ -816,6 +829,29 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use NPUPlace in CPU/GPU/XPU version, " "Please recompile or reinstall Paddle with NPU support.")); +#endif + } else if (is_mlu_tensor) { +#ifdef PADDLE_WITH_MLU + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + auto p = BOOST_GET_CONST(platform::MLUPlace, tensor.place()); + paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, + tensor_buf_ptr, copy_bytes, nullptr); + return py_arr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use MLUPlace in CPU/GPU/XPU/NPU version, " + "Please recompile or reinstall Paddle with MLU support.")); #endif } PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); diff --git 
a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 465397977fb07..8f05d286bf033 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -2,6 +2,16 @@ if (NOT WITH_INFRT) return() endif() +# compile flags +set(INFRT_FLAGS -Wno-comment) +foreach(flag ${INFRT_FLAGS}) + safe_set_cflag(CMAKE_C_FLAGS ${flag}) + safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) +endforeach() + +set(INFRT_SOURCE_DIR "${PADDLE_SOURCE_DIR}/paddle/infrt" ) +set(INFRT_BINARY_DIR "${PADDLE_BINARY_DIR}/paddle/infrt" ) +set(INFRT_TEST_TARGETS CACHE INTERNAL "") include(infrt_lib) set(infrt_src CACHE INTERNAL "" FORCE) @@ -44,6 +54,7 @@ function(cc_test_tiny TARGET_NAME) if (${cc_test_tiny_SERIAL}) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() + set(INFRT_TEST_TARGETS ${INFRT_TEST_TARGETS} ${TARGET_NAME} CACHE INTERNAL "") endif() endfunction() @@ -74,9 +85,12 @@ set(infrt_mlir_incs dense_tensor_inc pd_ops_inc rewrite_inc + trt_ops_inc ) message(STATUS "infrt srcs:\n${infrt_src}") cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto) cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs}) + +add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index c06d777163563..d145843684c63 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -51,3 +51,5 @@ infrt_exec_check(test_infrt_tensor_type mlir_tests/tensor_type.mlir) infrt_exec_check(test_infrt__basic mlir_tests/basic.mlir) infrt_exec_check(test_infrt_benchmark mlir_tests/benchmark.mlir) infrt_exec_check(test_infrt_mlir_dense_tensor mlir_tests/dense_tensor.mlir) + +add_subdirectory(tensorrt) diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index 1398378957069..58acd7c9a409a 100644 --- a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -58,12 +58,13 @@ static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT return b.getIntegerAttr(b.getI32Type(), constant); } -static mlir::ValueRange cvtValueToValueRange(const mlir::Value &operand) { - return mlir::ValueRange(operand); +static mlir::SmallVector<::mlir::Value, 4> cvtValueToValueRange( + const mlir::Value &operand) { + return mlir::SmallVector<::mlir::Value, 4>(1, operand); } -static mlir::ValueRange concatTwoValueRange(mlir::ValueRange operand_0, - mlir::ValueRange operand_1) { +static mlir::SmallVector<::mlir::Value, 4> concatTwoValueRange( + mlir::ValueRange operand_0, mlir::ValueRange operand_1) { mlir::SmallVector<::mlir::Value, 4> operands; operands.append(operand_0.begin(), operand_0.end()); operands.append(operand_1.begin(), operand_1.end()); diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 61dcfe5bfb1c3..7d6fdbbbf2f68 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -35,8 +35,4 @@ def INFRT_cvtValueToValueRange : NativeCodeCall< def INFRT_concatTwoValueRange : NativeCodeCall< "mlir::concatTwoValueRange($0, $1)">; - -class IsBoolAttrEq : Constraint< - CPred<"($0.getValue() ==" # value # ")">, - "Bool attrbute value constraint">; #endif // INFRT_BASE diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc index 8df8727dbe2b0..b318a6a763483 100644 --- a/paddle/infrt/dialect/mlir_loader.cc +++ b/paddle/infrt/dialect/mlir_loader.cc @@ 
-34,9 +34,11 @@ namespace infrt::dialect { mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, const std::string& mlir_source) { - context->allowUnregisteredDialects(); + // context->allowUnregisteredDialects(); RegisterCinnDialects(context->getDialectRegistry()); - context->getDialectRegistry().insert(); + // Currently, the CinnDialect and mlir::BuiltinDialect are + // enough. We don't need StandardOpsDialect. + // context->getDialectRegistry().insert(); mlir::ScopedDiagnosticHandler scope_handler( context, [](mlir::Diagnostic& diag) { @@ -54,7 +56,7 @@ mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, mlir::MLIRContext* context) { - context->allowUnregisteredDialects(); + // context->allowUnregisteredDialects(); RegisterCinnDialects(context->getDialectRegistry()); context->getDialectRegistry().insert(); diff --git a/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir b/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir index 1855a68dd91c3..6618fe66bda0a 100644 --- a/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir +++ b/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir @@ -1,8 +1,8 @@ func @ops() { - %a = pd.Feed() : tensor - %b = pd.Feed() : tensor + %a = pd.feed() {name="input0"} : tensor + %b = pd.feed() {name="input1"}: tensor - %c = "pd.Matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor + %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor infrt.return } diff --git a/paddle/infrt/dialect/mlir_tests/rewrite.mlir b/paddle/infrt/dialect/mlir_tests/rewrite.mlir index c984fda3e6211..bfad9d1f6924d 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite.mlir @@ -1,24 +1,24 @@ // CHECK-LABEL: @main func @main() -> tensor { - %a = "pd.Feed"() : () -> tensor - %b = "pd.Feed"() : () -> tensor - %bias = "pd.Feed"() : () -> tensor + %a = "pd.feed"() {name="input0"} : () -> tensor + %b = "pd.feed"() {name="input1"} : () -> tensor + %bias = "pd.feed"() {name="input2"} : () -> tensor - %b1 = "pd.Feed"() : () -> tensor - %b2 = "pd.Feed"() : () -> tensor - %bias1 = "pd.Feed"() : () -> tensor - %bias2 = "pd.Feed"() : () -> tensor + %b1 = "pd.feed"() {name="input3"} : () -> tensor + %b2 = "pd.feed"() {name="input4"} : () -> tensor + %bias1 = "pd.feed"() {name="input5"} : () -> tensor + %bias2 = "pd.feed"() {name="input6"} : () -> tensor - %c = "pd.Matmul"(%a, %b) {transpose_y=false} : (tensor, tensor) -> tensor - %d = "pd.ElementwiseAdd"(%c, %bias) {axis=1:i32} : (tensor, tensor) -> tensor - %e = "pd.Relu6"(%d) {} : (tensor) -> tensor + %c = "pd.matmul"(%a, %b) {transpose_y=false} : (tensor, tensor) -> tensor + %d = "pd.elementwise_add"(%c, %bias) {axis=1:i32} : (tensor, tensor) -> tensor + %e = "pd.relu6"(%d) {} : (tensor) -> tensor - %c1 = "pd.Matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.ElementwiseAdd"(%c1, %bias1) {axis=1:i32} : (tensor, tensor) -> tensor - %e1 = "pd.Relu"(%d1) {} : (tensor) -> tensor + %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=1:i32} : (tensor, tensor) -> tensor + %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor - %c2 = "pd.Matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.ElementwiseAdd"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor - %e2 = "pd.Relu"(%d2) {} : (tensor)
-> tensor + %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor + %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor infrt.return %e2 : tensor } \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir index d41d4b2f9f6bc..9ea1ec0ebca36 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir @@ -1,13 +1,13 @@ // CHECK-LABEL: @main func @main() -> tensor { - %a = "pd.Feed"() : () -> tensor - %filter = "pd.Constant"(){value = dense<1.000000e+00> : tensor<3x64x3x3xf32>} : () -> tensor<3x64x3x3xf32> - %bias = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %a = "pd.feed"() {name="input0"} : () -> tensor + %filter = "pd.constant"(){value = dense<1.000000e+00> : tensor<3x64x3x3xf32>} : () -> tensor<3x64x3x3xf32> + %bias = "pd.constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> - %scale = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> - %bias2 = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> - %mean = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> - %var = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %scale = "pd.constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %bias2 = "pd.constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %mean = "pd.constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %var = "pd.constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> %c = "pd.conv2d"(%a, %filter, %bias) {} : (tensor, tensor<3x64x3x3xf32>, tensor<64xf32>) -> tensor %d = "pd.batch_norm"(%c, %scale, %bias2, %mean, %var) {} : (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor diff --git a/paddle/infrt/dialect/mlir_tests/trt_ops.mlir b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir new file mode 100644 index 0000000000000..009b6d1c19653 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir @@ -0,0 +1,22 @@ +// CHECK-LABEL: @main +func @main() -> tensor { + %bias = "pd.feed"() {name="input0"} : () -> tensor + %c = "pd.feed"() {name="input1"} : () -> tensor + %b1 = "pd.feed"() {name="input2"} : () -> tensor + %b2 = "pd.feed"() {name="input3"} : () -> tensor + %bias1 = "pd.feed"() {name="input4"} : () -> tensor + %bias2 = "pd.feed"() {name="input5"} : () -> tensor + + %d = "pd.elementwise_add"(%c, %bias) {axis=1:i32} : (tensor, tensor) -> tensor + %e = "pd.relu6"(%d) {} : (tensor) -> tensor + + %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=1:i32} : (tensor, tensor) -> tensor + %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor + + %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor + %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor + + "pd.fetch"(%e2) :(tensor)->() +} diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index 7ca07dd5fcbba..ce10be6d100f8 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ 
b/paddle/infrt/dialect/pd_ops.cc @@ -20,11 +20,6 @@ namespace mlir { namespace pd { - -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" -#undef GET_OP_CLASSES - PaddleDialect::PaddleDialect(MLIRContext *context) : Dialect("pd", context, TypeID::get()) { addOperations< diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h index d09b6032257a2..71e0a53988d1a 100644 --- a/paddle/infrt/dialect/pd_ops.h +++ b/paddle/infrt/dialect/pd_ops.h @@ -53,5 +53,9 @@ class PaddleDialect : public Dialect { } }; +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.hpp.inc" +#undef GET_OP_CLASSES + } // namespace pd } // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.td b/paddle/infrt/dialect/pd_ops.td index 9e906ad0c02cc..b020b7ad5dbc7 100644 --- a/paddle/infrt/dialect/pd_ops.td +++ b/paddle/infrt/dialect/pd_ops.td @@ -6,14 +6,14 @@ include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/pd_op_base.td" -def PD_FeedOp : PD_Op<"Feed", [NoSideEffect]> { +def PD_FeedOp : PD_Op<"feed"> { let summary = "Feed Op"; let description = [{ Feed a tensor into the model. }]; - let arguments = (ins); + let arguments = (ins StrAttr:$name); let results = (outs PD_Tensor:$out); let assemblyFormat = [{ @@ -21,7 +21,27 @@ def PD_FeedOp : PD_Op<"Feed", [NoSideEffect]> { }]; } -def PD_ConstantOp : PD_Op<"Constant", [NoSideEffect, ConstantLike, DeclareOpInterfaceMethods, AllTypesMatch<["value", "output"]>]> { +def PD_FetchOp : PD_Op<"fetch", [Terminator]> { + let summary = "fetch Op"; + + let description = [{ + Fetch tensor from the graph. + }]; + + let arguments = (ins Variadic:$inputs); +} + +def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { + let summary = "paddle graph Op"; + let description = [{ + Describe a paddle graph or subgraph. 
+ }]; + let regions = (region SizedRegion<1>:$body); + let arguments = (ins Variadic:$inputs); + let results = (outs Variadic:$outputs); +} + +def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInterfaceMethods, AllTypesMatch<["value", "output"]>]> { let summary = "constant Op"; let description = [{}]; @@ -34,7 +54,7 @@ def PD_ConstantOp : PD_Op<"Constant", [NoSideEffect, ConstantLike, DeclareOpInte ]; } -def PD_AbsOp : PD_Op<"Abs", [NoSideEffect, SameOperandsAndResultType]> { +def PD_AbsOp : PD_Op<"abs", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the absolute value of a tensor"; let description = [{ @@ -54,7 +74,7 @@ def PD_SqrtOp : PD_Op<"sqrt", [NoSideEffect, SameOperandsAndResultType]> { let results = (outs PD_Tensor:$y); } -def PD_ReluOp : PD_Op<"Relu", [NoSideEffect, SameOperandsAndResultType]> { +def PD_ReluOp : PD_Op<"relu", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the Relu of a tensor"; let description = [{ @@ -65,7 +85,7 @@ def PD_ReluOp : PD_Op<"Relu", [NoSideEffect, SameOperandsAndResultType]> { let hasCanonicalizer = 1; } -def PD_Relu6Op : PD_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { +def PD_Relu6Op : PD_Op<"relu6", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the Relu6 of a tensor"; let description = [{ @@ -75,7 +95,7 @@ def PD_Relu6Op : PD_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { let results = (outs PD_Tensor:$y); } -def PD_ElementwiseAdd : PD_Op<"ElementwiseAdd", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { +def PD_ElementwiseAdd : PD_Op<"elementwise_add", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { let summary = "ElementwiseAdd Op"; let description = [{ }]; @@ -86,7 +106,7 @@ def PD_ElementwiseAdd : PD_Op<"ElementwiseAdd", [NoSideEffect, Commutative, Decl let hasFolder = 1; } -def PD_ElementwiseSub : PD_Op<"ElementwiseSub", [NoSideEffect, DeclareOpInterfaceMethods]> { +def PD_ElementwiseSub : PD_Op<"elementwise_sub", [NoSideEffect, DeclareOpInterfaceMethods]> { let summary = "ElementwiseSub Op"; let description = [{ }]; @@ -95,7 +115,7 @@ def PD_ElementwiseSub : PD_Op<"ElementwiseSub", [NoSideEffect, DeclareOpInterfac let results = (outs PD_Tensor:$out); } -def PD_ElementwiseMul : PD_Op<"ElementwiseMul", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { +def PD_ElementwiseMul : PD_Op<"elementwise_mul", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { let summary = "ElementwiseMul Op"; let description = [{ }]; @@ -104,7 +124,7 @@ def PD_ElementwiseMul : PD_Op<"ElementwiseMul", [NoSideEffect, Commutative, Decl let results = (outs PD_Tensor:$out); } -def PD_ElementwiseDiv : PD_Op<"ElementwiseDiv", [NoSideEffect, DeclareOpInterfaceMethods]> { +def PD_ElementwiseDiv : PD_Op<"elementwise_div", [NoSideEffect, DeclareOpInterfaceMethods]> { let summary = "ElementwiseDiv Op"; let description = [{ }]; @@ -113,7 +133,7 @@ def PD_ElementwiseDiv : PD_Op<"ElementwiseDiv", [NoSideEffect, DeclareOpInterfac let results = (outs PD_Tensor:$out); } -def PD_MatmulOp : PD_Op<"Matmul", [NoSideEffect]> { +def PD_MatmulOp : PD_Op<"matmul", [NoSideEffect]> { let summary = "Computes the matrix mulplication result of two tensors"; let description = [{ }]; @@ -161,7 +181,7 @@ def PD_BatchNormOp : PD_Op<"batch_norm", [NoSideEffect]> { let hasCanonicalizer = 1; } -def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> { +def PD_FusedFC : PD_Op<"fc", [NoSideEffect]> { let summary = "Computes the Fully Connected result of two tensors"; let 
description = [{ }]; @@ -170,7 +190,7 @@ def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> { let results = (outs PD_Tensor:$out); } -def PD_FusedRepeatedFCRelu : PD_Op<"RepeatedFCRelu", [SameVariadicOperandSize, NoSideEffect]> { +def PD_FusedRepeatedFCRelu : PD_Op<"fusion_repeated_fc_relu", [SameVariadicOperandSize, NoSideEffect]> { let summary = ""; let description = [{ }]; diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index 3c5a2b6a7bf90..43a3577b90f10 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -115,7 +115,7 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "mlir demo"); mlir::MLIRContext *context = infrt::Global::getMLIRContext(); - context->allowUnregisteredDialects(); + // context->allowUnregisteredDialects(); auto ®istry = context->getDialectRegistry(); infrt::RegisterCinnDialects(registry); diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/rewrite.td index aa81dd72d059b..b5b7cf0667f68 100644 --- a/paddle/infrt/dialect/rewrite.td +++ b/paddle/infrt/dialect/rewrite.td @@ -15,13 +15,16 @@ include "paddle/infrt/dialect/pd_ops.td" // which corresponds to the following computation: // (FusedFC) out = x * y + bias // +// while meeting the following attribute constrait: +// Matmul: transpose_x: false +// transpose_y: false +// // Todo: // 1. Make the constrait more completely. // 2. Consider the case of : out = bias + z //===----------------------------------------------------------------------===// -def FuseMulAdd : Pat<(PD_ElementwiseAdd (PD_MatmulOp $x, $y, $transpose_x, $transpose_y, $alpha), $bias, $axis), - (PD_FusedFC $x, $y, $bias, (INFRT_createI32Attr<"1">)), - [(IsBoolAttrEq<"false"> $transpose_x),(IsBoolAttrEq<"false"> $transpose_y)]>; +def FuseMulAdd : Pat<(PD_ElementwiseAdd (PD_MatmulOp $x, $y, ConstBoolAttrFalse:$_, ConstBoolAttrFalse:$_, $alpha), $bias, $axis), + (PD_FusedFC $x, $y, $bias, (INFRT_createI32Attr<"1">))>; //===----------------------------------------------------------------------===// diff --git a/paddle/infrt/dialect/tensorrt/CMakeLists.txt b/paddle/infrt/dialect/tensorrt/CMakeLists.txt new file mode 100755 index 0000000000000..794266513eb81 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/CMakeLists.txt @@ -0,0 +1,12 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + trt_ops.cc + trt_op_teller_pass.cc + trt_graph_fuse_pass.cc + trt_graph_split_pass.cc + ) +mlir_tablegen_on(trt_ops) + +add_executable(trt-exec trt_exec.cc) +target_link_libraries(trt-exec infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc new file mode 100644 index 0000000000000..dc0f2acb2b733 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
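The new `trt-exec` tool added in the following hunk loads an MLIR file and runs the three TensorRT preparation passes over every function. A minimal, hedged sketch of how that pipeline is assembled with the `PassManager` API this patch uses; the pass ordering (teller, then fuse, then split) follows the pass documentation, and the `mlir::FuncOp` nesting plus the threshold of 10 are assumptions taken from the hunk below, whose template arguments were lost in extraction.

```cpp
#include <memory>

#include "mlir/IR/Function.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/Module.h"
#include "mlir/Pass/PassManager.h"
#include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h"
#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"

// Sketch only: schedule the three TensorRT passes on every function in `module`.
mlir::LogicalResult RunTrtPasses(mlir::MLIRContext *context,
                                 mlir::ModuleOp module) {
  mlir::PassManager pm(context);
  mlir::OpPassManager &fpm = pm.nest<mlir::FuncOp>();          // assumed nesting target
  fpm.addPass(std::make_unique<infrt::trt::trtOpTellerPass>());   // wrap candidate ops into pd.graph
  fpm.addPass(std::make_unique<infrt::trt::trtGraphFusePass>());  // merge adjacent pd.graph ops
  fpm.addPass(std::make_unique<infrt::trt::trtGraphSplitPass>(10));  // inline graphs that stay too small
  return pm.run(module);
}
```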
+#include +#include +#include "llvm/Support/CommandLine.h" +#include "mlir/Pass/PassManager.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" +#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" + +int main(int argc, char** argv) { + static llvm::cl::opt input_file( + llvm::cl::Positional, + llvm::cl::desc("Specify input filename"), + llvm::cl::init("-")); + + llvm::cl::ParseCommandLineOptions(argc, argv); + + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); + + module->dump(); + mlir::PassManager pm(context); + + mlir::OpPassManager& trt_pass_manager = pm.nest(); + trt_pass_manager.addPass(std::make_unique()); + trt_pass_manager.addPass(std::make_unique()); + trt_pass_manager.addPass(std::make_unique(10)); + if (mlir::failed(pm.run(*module))) { + std::cout << "\npass failed!\n" << std::endl; + return 4; + } + module->dump(); + return 0; +} diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc new file mode 100644 index 0000000000000..181f462962aee --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" + +#include +#include +#include +#include "llvm/ADT/SetVector.h" +#include "mlir/Analysis/SliceAnalysis.h" +#include "mlir/IR/Builders.h" +#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" + +namespace infrt { +namespace trt { +namespace { +// ReverseDfs +// do reverse dfs. calls "func" to search when visit a node. +// The elements in 'source' can't be nullptr. +// Reference the function nameed "FlexibleDFS" but defined in: +// paddle/fluid/framework/ir/subgraph_detector.cc. + +bool reverseDfs(std::vector<::mlir::Operation *> source, + const std::function &func) { + std::unordered_set visited; + while (!source.empty()) { + auto node = source.back(); + source.pop_back(); + if (visited.count(node)) continue; + visited.insert(node); + if (func(node)) return true; + auto values = node->getOperands(); + for (auto value : values) { + // if the value is a block argument, the node is nullptr. + ::mlir::Operation *node = value.getDefiningOp(); + if (node != nullptr && !visited.count(node)) { + source.emplace_back(node); + } + } + } + return false; +} + +// merge the first&second graph op to a new graph op. 
+void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT + ::mlir::pd::GraphOp first, + ::mlir::pd::GraphOp second) { + // comput inputs and outputs + ::llvm::SmallVector<::mlir::Value, 4> inputs(first.getOperands()), outputs; + for (::mlir::Value input : second.getOperands()) { + if (input.getDefiningOp() != first) { + inputs.push_back(input); + } + } + ::llvm::DenseMap<::mlir::Value, unsigned int> op_output_mapping; + for (::mlir::Value output : first.getResults()) { + for (::mlir::Operation *user : output.getUsers()) { + if (user != second && user->getParentOp() != second) { + op_output_mapping[output] = outputs.size(); + outputs.push_back(output); + break; + } + } + } + auto fetch_op = second.getBody()->getTerminator(); + outputs.append(fetch_op->getOperands().begin(), + fetch_op->getOperands().end()); + ::llvm::SmallVector<::mlir::Type, 4> fetch_types; + for (auto value : outputs) { + fetch_types.push_back(value.getType()); + } + + // create the new graph op + builder.setInsertionPoint(first); + auto loc = first.getLoc(); + auto graph_op = builder.create<::mlir::pd::GraphOp>(loc, fetch_types, inputs); + ::mlir::Block *block = new ::mlir::Block; + auto copy_range = second.getBody()->without_terminator(); + block->getOperations().splice(block->begin(), + second.getBody()->getOperations(), + copy_range.begin(), + copy_range.end()); + copy_range = first.getBody()->without_terminator(); + block->getOperations().splice(block->begin(), + first.getBody()->getOperations(), + copy_range.begin(), + copy_range.end()); + builder.setInsertionPointToEnd(block); + builder.create(loc, outputs); + graph_op.body().push_back(block); + + // mapping the output + unsigned int num_result = first.getNumResults(); + fetch_op = first.getBody()->getTerminator(); + for (unsigned int index = 0; index < num_result; ++index) { + auto origin_value = first.getResult(index); + if (op_output_mapping.find(origin_value) == op_output_mapping.end()) { + origin_value.replaceAllUsesWith(fetch_op->getOperand(index)); + } else { + auto inner_value = fetch_op->getOperand(index); + auto outer_value = graph_op.getResult(op_output_mapping[origin_value]); + while (!origin_value.use_empty()) { + auto replace_value = + origin_value.use_begin()->getOwner()->getParentOp() == graph_op + ? inner_value + : outer_value; + origin_value.use_begin()->set(replace_value); + } + } + } + second.replaceAllUsesWith( + graph_op.getResults().take_back(second.getNumResults())); + first.erase(); + second.erase(); +} + +// Topological sort the function op. +void topoSortBlock(mlir::Block &body) { // NOLINT + llvm::SetVector toSort; + if (body.empty()) return; + for (auto it = body.rbegin(); it != body.rend(); ++it) { + toSort.insert(&*it); + } + llvm::SetVector result = + ::mlir::topologicalSort(std::move(toSort)); + for (auto *op : result) { + op->moveBefore(body.getTerminator()); + } +} + +} // namespace + +// Implementation of the trtGraphFusePass. +void trtGraphFusePass::runOnFunction() { + mlir::Block &body = getFunction().front(); + ::mlir::OpBuilder builder(&body, body.begin()); + bool changed = false; + do { + changed = false; + for (auto &op : body) { + ::mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + if (nullptr == graph_op) continue; + + for (auto user_op : op.getUsers()) { + ::mlir::pd::GraphOp user_graph_op = + ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(user_op); + if (nullptr == user_graph_op) continue; + // get all dst input nodes except src. 
+ std::vector<::mlir::Operation *> source_nodes; + for (auto operand : user_op->getOperands()) { + auto input = operand.getDefiningOp(); + if (input != &op && input != nullptr) { + source_nodes.push_back(input); + } + } + // Reverse DFS from the source_nodes. + if (!reverseDfs(source_nodes, [&op](const ::mlir::Operation *n) { + return n == &op; + })) { + mergeTwoAdjacentGraphOp(builder, graph_op, user_graph_op); + changed = true; + break; + } + } + if (changed) break; + } + } while (changed); + topoSortBlock(body); +} +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h new file mode 100644 index 0000000000000..e7134e88f316c --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "mlir/Pass/Pass.h" + +namespace infrt { +namespace trt { +/* + * trtGraphFusePass. + * + * Merge the adjacent graph op to a new graph op. + * + * source func: + * + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "pd.graph"(%a) { + * %m = "pd.conv2d"(%a)... + * "pd.fetch" %m + * } ... + * %d = "pd.graph"(%c) { + * %m = "pd.conv3d"(%c)... + * "pd.fetch" %m + * } ... + * %f = "pd.graph"(%a) { + * %m = "pd.conv2d"(%a)... + * "pd.fetch" %m + * } ... + * "pd.fetch" %d, %f + * + * destination func: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %d, %f = "pd.graph"(%a) { + * %m = "pd.conv2d"(%a)... + * %n = "pd.conv3d"(%m)... + * %s = "pd.conv2d"(%a)... + * "pd.fetch" %n, %s + * } ... + * "pd.fetch" %d, %f + * } + */ +class trtGraphFusePass + : public ::mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "trtGraphFusePass"; } + void runOnFunction() override; +}; +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc new file mode 100644 index 0000000000000..2b45364de2036 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
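Before merging, `trtGraphFusePass` must prove that pulling `second` into `first` cannot create a use-before-def: if any other producer of `second`'s operands can reach `first`, an intervening dependency chain would be broken by splicing the two bodies. A hedged sketch of that legality check, reusing the `reverseDfs` helper defined in `trt_graph_fuse_pass.cc` above (the free-standing function name is illustrative):

```cpp
#include <vector>

#include "paddle/infrt/dialect/pd_ops.h"

// Sketch: `second` may be merged into `first` only if none of second's other
// producers can reach `first` through the def-use graph.
static bool CanMergeGraphs(::mlir::pd::GraphOp first, ::mlir::pd::GraphOp second) {
  ::mlir::Operation *first_op = first.getOperation();
  std::vector<::mlir::Operation *> other_producers;
  for (::mlir::Value operand : second.getOperands()) {
    ::mlir::Operation *producer = operand.getDefiningOp();
    if (producer != nullptr && producer != first_op) {
      other_producers.push_back(producer);
    }
  }
  // reverseDfs returns true as soon as the predicate matches a reachable node;
  // reaching `first` from those producers means the merge must be skipped.
  return !reverseDfs(other_producers, [first_op](const ::mlir::Operation *op) {
    return op == first_op;
  });
}
```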
+ +#include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" + +#include "mlir/IR/Builders.h" +#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" + +namespace infrt { +namespace trt { +// Implementation of the trtGraphSplitPass。 +void trtGraphSplitPass::runOnFunction() { + std::vector<::mlir::pd::GraphOp> worklist; + ::mlir::Block& block = getFunction().front(); + for (auto& op : block) { + ::mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + if (nullptr != graph_op && + graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { + worklist.push_back(graph_op); + } + } + while (!worklist.empty()) { + ::mlir::pd::GraphOp graph_op = worklist.back(); + worklist.pop_back(); + ::mlir::Block* body = graph_op.getBody(); + auto fetch_op = body->getTerminator(); + graph_op.replaceAllUsesWith(fetch_op->getOperands()); + auto copy_range = body->without_terminator(); + block.getOperations().splice(::mlir::Block::iterator(graph_op), + body->getOperations(), + copy_range.begin(), + copy_range.end()); + graph_op.erase(); + } +} +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h new file mode 100644 index 0000000000000..092df0cf834e5 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -0,0 +1,60 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "mlir/Pass/Pass.h" + +namespace infrt { +namespace trt { +/* + * trtGraphSplitPass. + * + * Splite the graph op when the number of operations is too small. + * The feature is the opposite of 'trtOpTellerPass'. + * + * source func: + * + * func @main() -> tensor { + * %a = "pd.feed"()... + * %d, %f = "pd.graph"(%a) { + * %m = "pd.conv2d"(%a)... + * %n = "pd.conv3d"(%m)... + * %s = "pd.conv2d"(%a)... + * "pd.fetch" %n, %s + * } ... + * "pd.fetch" %d, %f + * } + * + * destination func: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "pd.conv2d"(%a) ... + * %d = "pd.conv3d"(%c) ... + * %f = "pd.conv2d"(%a) ... + * "pd.fetch" %d, %f + * } + */ +class trtGraphSplitPass + : public ::mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; } + void runOnFunction() override; + explicit trtGraphSplitPass(size_t min_subgraph_size = 3) + : min_subgraph_size_(min_subgraph_size) {} + + private: + size_t min_subgraph_size_; +}; +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_base.td b/paddle/infrt/dialect/tensorrt/trt_op_base.td new file mode 100755 index 0000000000000..5722f17d59787 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_op_base.td @@ -0,0 +1,77 @@ +// This file defines some basic elements of Paddle(alias trt) dialect. 
+// We learned much from TensorFlow mlir dialect https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td + +#ifndef TRT_OP_BASE +#define TRT_OP_BASE + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def TRT_Dialect : Dialect { + let name = "trt"; + + let description = [{ + The PaddlePaddle dialect. + + This dialect contains the PaddlePaddle operators. + }]; + + let cppNamespace = "::infrt::trt"; +} + +class TRT_Op traits = []> : + Op; + + +class TRT_PaddleAttr : + Attr()">, + "PaddlePaddle " # description # " attribute">; + + +//===----------------------------------------------------------------------===// +// PaddlePaddle type definitions +//===----------------------------------------------------------------------===// + +def TRT_TRTDialectType : Type()">, "PaddlePaddle type">; + +class TRT_PaddleType : + Type()">, + "Paddle " # description # " type">, + BuildableType<"getType()">; + +//===----------------------------------------------------------------------===// +// Integer types +def TRT_Bool : AnyTypeOf<[I<1>], "bool">; +def TRT_Int8 : AnyTypeOf<[I8], "8-bit integer">; +def TRT_Int16 : AnyTypeOf<[I16], "16-bit integer">; +def TRT_Int32 : AnyTypeOf<[I32], "32-bit integer">; +def TRT_Int64 : AnyTypeOf<[I64], "64-bit integer">; + +def TRT_UInt8 : AnyTypeOf<[UI<8>], "8-bit unsigned integer">; +def TRT_UInt16 : AnyTypeOf<[UI<16>], "16-bit unsigned integer">; +def TRT_UInt32 : AnyTypeOf<[UI<32>], "32-bit unsigned integer">; +def TRT_UInt64 : AnyTypeOf<[UI<64>], "64-bit unsigned integer">; + +def TRT_SInt : AnyTypeOf<[TRT_Int8, TRT_Int16, TRT_Int32, TRT_Int64], "signed integer">; +def TRT_UInt : AnyTypeOf<[TRT_UInt8, TRT_UInt16, TRT_UInt32, TRT_UInt64], "unsigned integer">; +def TRT_Int : AnyTypeOf<[TRT_SInt, TRT_UInt], "integer">; + +// Float types +def TRT_Float16 : AnyTypeOf<[F16], "16-bit float">; +def TRT_Float32 : AnyTypeOf<[F32], "32-bit float">; +def TRT_Float64 : AnyTypeOf<[F64], "64-bit float">; + +def TRT_Float : AnyTypeOf<[TRT_Float16, TRT_Float32, TRT_Float64], "floating-point">; + + +// Tensor types + +def TRT_ElementType : Type, + "trt.dtype">; + +def TRT_Tensor : TensorOf<[TRT_ElementType]>; + + +#endif // TRT_OP_BASE diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc new file mode 100644 index 0000000000000..7b7fbb05c1d13 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
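The teller pass that follows wraps each eligible operation into its own single-op `pd.graph`; only `pd.feed`, `pd.fetch`, and already-formed `pd.graph` ops are skipped. A hedged sketch of that filter as a standalone predicate (the function name is illustrative; as the TODO in the pass header below notes, real TensorRT capability checks are still to be added, so the filter is intentionally permissive):

```cpp
#include "llvm/Support/Casting.h"
#include "mlir/IR/Operation.h"
#include "paddle/infrt/dialect/pd_ops.h"

// Sketch of the per-op check trtOpTellerPass applies when building pd.graph ops.
static bool ShouldWrapIntoGraph(::mlir::Operation *op) {
  if (::llvm::isa<::mlir::pd::FeedOp>(op)) return false;   // graph inputs stay outside
  if (::llvm::isa<::mlir::pd::FetchOp>(op)) return false;  // terminators stay outside
  if (::llvm::isa<::mlir::pd::GraphOp>(op)) return false;  // already wrapped
  return true;
}
```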
+ +#include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" + +#include "mlir/IR/Builders.h" +#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" + +namespace infrt { +namespace trt { +// Implementation of the trtOpTellerPass。 +void trtOpTellerPass::runOnFunction() { + ::mlir::Block &body = getFunction().front(); + std::vector<::mlir::Operation *> worklist; + worklist.reserve(body.getOperations().size()); + for (auto &op : body) { + worklist.push_back(&op); + } + // Build GraphOp. + ::mlir::OpBuilder builder(&body, body.begin()); + while (!worklist.empty()) { + auto *op = worklist.back(); + worklist.pop_back(); + if (op == nullptr) continue; + auto op1 = ::llvm::dyn_cast_or_null<::mlir::pd::FeedOp>(op); + if (op1) continue; + auto op2 = ::llvm::dyn_cast_or_null<::mlir::pd::FetchOp>(op); + if (op2) continue; + auto op3 = ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(op); + if (op3) continue; + builder.setInsertionPoint(op); + auto loc = getFunction().getLoc(); + auto graph_op = builder.create<::mlir::pd::GraphOp>( + loc, op->getResultTypes(), op->getOperands()); + + ::llvm::SmallVector<::mlir::Value, 4> tblgen_repl_values; + for (auto v : + ::llvm::SmallVector<::mlir::Value, 4>{graph_op.getODSResults(0)}) { + tblgen_repl_values.push_back(v); + } + op->replaceAllUsesWith(tblgen_repl_values); + // Build graph op. + ::mlir::Block *block = new ::mlir::Block; + graph_op.body().push_back(block); + op->moveBefore(block, block->begin()); + builder.setInsertionPointToEnd(block); + builder.create(loc, op->getResults()); + } +} +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h new file mode 100644 index 0000000000000..b03945b3459c0 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "mlir/Pass/Pass.h" + +namespace infrt { +namespace trt { +/* + * trtOpTellerPass. + * + * Pick out the operators supported by tensorrt and convert it to graph. + * + * source func: + * + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "pd.conv2d"(%a) ... + * %d = "pd.conv3d"(%c) ... + * %f = "pd.conv2d"(%a) ... + * "pd.fetch" %d, %f + * } + * + * destination func: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "pd.graph"(%a) { + * %m = "pd.conv2d"(%a)... + * "pd.fetch" %m + * } ... + * %d = "pd.graph"(%c) { + * %m = "pd.conv3d"(%c)... + * "pd.fetch" %m + * } ... + * %f = "pd.graph"(%a) { + * %m = "pd.conv2d"(%a)... + * "pd.fetch" %m + * } ... + * "pd.fetch" %d, %f + * } + * TODO(winter-wang): Supplementary how to judge the operators can be supported + * by tensorrt. 
+ */ +class trtOpTellerPass + : public ::mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "trtOpTellerPass"; } + void runOnFunction() override; +}; +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc new file mode 100644 index 0000000000000..4c02238b10e1d --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +namespace infrt { +namespace trt { + +TensorRTDialect::TensorRTDialect(::mlir::MLIRContext *context) + : ::mlir::Dialect("trt", context, ::mlir::TypeID::get()) { + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT + >(); +#undef GET_OP_LIST +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT +#undef GET_OP_CLASSES + +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h new file mode 100644 index 0000000000000..c9043c2280de0 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
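The `trt.graph`/`trt.fetch` ops declared in `trt_ops.td` below mirror the `pd.graph`/`pd.fetch` pair, so constructing them follows the same builder pattern the teller pass uses. A hedged sketch, assuming the ODS-generated class names `infrt::trt::GraphOp` and `infrt::trt::FetchOp` and the default generated builders; result types and fetched values are placeholders:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/Builders.h"
#include "paddle/infrt/dialect/tensorrt/trt_ops.h"

// Sketch: create an empty trt.graph with the given result types and terminate
// its single block with trt.fetch over `results_to_fetch`.
infrt::trt::GraphOp BuildTrtGraph(::mlir::OpBuilder &builder,
                                  ::mlir::Location loc,
                                  ::llvm::ArrayRef<::mlir::Type> result_types,
                                  ::mlir::ValueRange results_to_fetch) {
  auto graph_op = builder.create<infrt::trt::GraphOp>(loc, result_types);
  ::mlir::Block *block = new ::mlir::Block;   // ownership passes to the region below
  graph_op.body().push_back(block);
  builder.setInsertionPointToEnd(block);
  builder.create<infrt::trt::FetchOp>(loc, results_to_fetch);
  return graph_op;
}
```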
+ +#pragma once + +#include "mlir/Dialect/Traits.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/LoopLikeInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +namespace infrt { +namespace trt { + +class TensorRTDialect : public ::mlir::Dialect { + public: + explicit TensorRTDialect(::mlir::MLIRContext* context); + static llvm::StringRef getDialectNamespace() { return "trt"; } +}; + +// mlir bug。 can be removed safety when update mlir to llvm11. +using namespace mlir; // NOLINT + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensorrt/trt_ops.hpp.inc" +#undef GET_OP_CLASSES + +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td new file mode 100755 index 0000000000000..cc072b6e6885b --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -0,0 +1,30 @@ +#ifndef TRT_OPS +#define TRT_OPS + +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/LoopLikeInterface.td" +include "mlir/Interfaces/CallInterfaces.td" +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/tensorrt/trt_op_base.td" + +def TRT_FetchOp : TRT_Op<"fetch", [Terminator]> { + let summary = "TensorRT engine return operation"; + let description = [{ + The `trt.fetch` operation terminates and returns values for the + `trt.graph` operation. + }]; + + let arguments = (ins Variadic:$inputs); +} + +def TRT_GraphOp : TRT_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { + let summary = "trt Graph Op"; + let description = [{ + Describe a tensorrt subgraph. 
+ }]; + let regions = (region SizedRegion<1>:$body); + + let results = (outs Variadic:$outputs); + +} +#endif // TRT_OPS diff --git a/paddle/infrt/paddle/CMakeLists.txt b/paddle/infrt/paddle/CMakeLists.txt index 172d78ecde3b8..21c117535fe70 100644 --- a/paddle/infrt/paddle/CMakeLists.txt +++ b/paddle/infrt/paddle/CMakeLists.txt @@ -11,12 +11,6 @@ gather_srcs(infrt_src SRCS tensor.cc ) -foreach(cpp ${SRCS}) - set(infrt_src - "${infrt_src};infrt/paddle/${cpp}" - CACHE INTERNAL "") -endforeach() - file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) foreach(header ${includes}) diff --git a/paddle/infrt/paddle/cpp/CMakeLists.txt b/paddle/infrt/paddle/cpp/CMakeLists.txt index 0feaabd2fa7c9..8b48603bddf8e 100644 --- a/paddle/infrt/paddle/cpp/CMakeLists.txt +++ b/paddle/infrt/paddle/cpp/CMakeLists.txt @@ -1,14 +1,3 @@ -core_gather_headers() - -gather_srcs(infrt_src SRCS - ) - -foreach(cpp ${SRCS}) - set(infrt_src - "${infrt_src};infrt/paddle/cpp/${cpp}" - CACHE INTERNAL "") -endforeach() - file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) foreach(header ${includes}) diff --git a/paddle/infrt/paddle/pb/CMakeLists.txt b/paddle/infrt/paddle/pb/CMakeLists.txt index fac38afa62db2..b3491cfe13618 100644 --- a/paddle/infrt/paddle/pb/CMakeLists.txt +++ b/paddle/infrt/paddle/pb/CMakeLists.txt @@ -1,5 +1,3 @@ -core_gather_headers() - gather_srcs(infrt_src SRCS var_desc.cc op_desc.cc @@ -7,12 +5,6 @@ gather_srcs(infrt_src SRCS program_desc.cc ) -foreach(cpp ${SRCS}) - set(infrt_src - "${infrt_src};infrt/paddle/pb/${cpp}" - CACHE INTERNAL "") -endforeach() - file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) foreach(header ${includes}) diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index 799ec885b997d..05b321c50c1c4 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -23,14 +23,10 @@ add_subdirectory(ops) add_subdirectory(tests) # make an unity target for compile deps -set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context) -set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu conj_kernel_cpu scale_kernel_cpu full_kernel_cpu) -set(PTEN_DEPS ${PTEN_DEPS} nary unary binary) -if(WITH_GPU OR WITH_ROCM) - set(PTEN_DEPS ${PTEN_DEPS} math_gpu linalg_gpu manipulation_gpu conj_kernel_gpu scale_kernel_gpu full_kernel_gpu) -endif() -if(WITH_XPU) - set(PTEN_DEPS ${PTEN_DEPS} manipulation_xpu) -endif() +set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context infermeta) +get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) +# keep this message for debug, remove it later if needless +message(STATUS "All standard pten kernels: ${pten_kernels}") +set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels}) cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS}) diff --git a/paddle/pten/all.h b/paddle/pten/all.h index b7ef1c1ec2611..7dd517e5e6381 100644 --- a/paddle/pten/all.h +++ b/paddle/pten/all.h @@ -16,8 +16,6 @@ limitations under the License. 
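The `dispatch.h` hunk below adds `PD_DISPATCH_FLOATING_AND_COMPLEX_AND_1_TYPES` and `_2_TYPES`. A hedged usage sketch following the lambda-plus-`data_t` convention of the existing `PD_DISPATCH_*` macros; the kernel, tensor names, and the extra dispatched type are illustrative, not part of this patch:

```cpp
#include "paddle/pten/api/ext/dispatch.h"

// Illustrative type-specific kernel body; the real work is omitted.
template <typename T>
void MyKernelImpl(const paddle::Tensor& x, paddle::Tensor* out) {
  // ... computation specialized on T would go here ...
}

// Dispatch over float/double/complex64/complex128 plus one extra type (INT64).
// `data_t` is the alias introduced by PD_PRIVATE_CASE_TYPE for the matched branch.
void MyKernelDispatch(const paddle::Tensor& x, paddle::Tensor* out) {
  PD_DISPATCH_FLOATING_AND_COMPLEX_AND_1_TYPES(
      paddle::DataType::INT64, x.type(), "my_kernel", ([&] {
        MyKernelImpl<data_t>(x, out);
      }));
}
```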
*/ // developer apis #include "paddle/pten/include/core.h" -#include "paddle/pten/include/creation.h" #include "paddle/pten/include/infermeta.h" #include "paddle/pten/include/linalg.h" -#include "paddle/pten/include/manipulation.h" #include "paddle/pten/include/math.h" diff --git a/paddle/pten/api/ext/dispatch.h b/paddle/pten/api/ext/dispatch.h index 3b40a39af5300..07d29ef3e140b 100644 --- a/paddle/pten/api/ext/dispatch.h +++ b/paddle/pten/api/ext/dispatch.h @@ -159,6 +159,73 @@ namespace paddle { } \ }() +///////// Floating and Complex and other type Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_1_TYPES( \ + SPECIFIED_TYPE, TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating and Complex and 2 other type Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_2_TYPES( \ + SPECIFIED_TYPE1, SPECIFIED_TYPE2, TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE1, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE2, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + ///////// Floating, Integral and Complex Dispatch Marco /////////// #define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) 
\ diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h index ebae064c33689..0b17415a6a98d 100644 --- a/paddle/pten/api/include/kernel_signature.h +++ b/paddle/pten/api/include/kernel_signature.h @@ -33,8 +33,10 @@ using add_kernel = void (*)(const DeviceContext&, int, DenseTensor*); -using cast_kernel = void (*)( - const DeviceContext&, const DenseTensor&, DataType, DataType, DenseTensor*); +using cast_kernel = void (*)(const DeviceContext&, + const DenseTensor&, + DataType, + DenseTensor*); using divide_kernel = void (*)(const DeviceContext&, const DenseTensor&, @@ -50,6 +52,11 @@ using dot_kernel = void (*)(const DeviceContext&, using flatten_kernel = void (*)(const DeviceContext&, const DenseTensor&, int, int, DenseTensor*); +using empty_kernel = void (*)(const DeviceContext&, + const ScalarArray&, + DenseTensor*); + +using empty_like_kernel = void (*)(const DeviceContext&, DenseTensor*); using full_kernel = void (*)(const DeviceContext&, const ScalarArray&, const Scalar&, diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index 935c7d8e325d0..b22d2d65a439c 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -204,6 +204,14 @@ class PADDLE_API Tensor final { */ DataLayout layout() const; + /** + * @brief Determine whether tensor is DenseTensor + * + * @return true + * @return false + */ + bool is_dense_tensor() const; + /* Part 3: Device and Backend methods */ /** @@ -296,7 +304,7 @@ class PADDLE_API Tensor final { * The index number begins from begin_idx + 1. * @return Tensor */ - Tensor slice(const int64_t begin_idx, const int64_t end_idx) const; + Tensor slice(int64_t begin_idx, int64_t end_idx) const; /** * @brief Return the implemention of current Tensor. diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h deleted file mode 100644 index 4dbd46bff65ad..0000000000000 --- a/paddle/pten/api/lib/kernel_declare.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
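The `tensor.h`/`tensor.cc` hunks above add a public `is_dense_tensor()` query and simplify `slice()` to take its indices by value. A hedged usage sketch of the updated surface; the slicing range is illustrative and `t` is assumed to be an initialized DenseTensor-backed tensor:

```cpp
#include "paddle/pten/api/include/tensor.h"

// Sketch: query the backing implementation before taking a sub-tensor.
void InspectTensor(const paddle::experimental::Tensor& t) {
  if (t.is_dense_tensor()) {
    // Take the leading slice [0, 2); both arguments are now passed by value.
    paddle::experimental::Tensor head = t.slice(0, 2);
    (void)head;
  }
}
```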
*/ - -#pragma once - -#include "paddle/pten/core/kernel_registry.h" - -// TODO(chenweihang) After the kernel is split into a single file, -// the kernel declare statement is automatically generated according to the -// file name of the kernel, and this header file will be removed - -PT_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dot, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(flatten, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sign, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(conj, CPU, ALL_LAYOUT); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(full_like, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dot, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(flatten, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sign, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(scale, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(conj, GPU, ALL_LAYOUT); -#endif - -#ifdef PADDLE_WITH_XPU -PT_DECLARE_KERNEL(flatten, XPU, ALL_LAYOUT); -#endif diff --git a/paddle/pten/api/lib/op_meta_info.cc b/paddle/pten/api/lib/op_meta_info.cc index 586fa0cc05526..aa2e33afb94b8 100644 --- a/paddle/pten/api/lib/op_meta_info.cc +++ b/paddle/pten/api/lib/op_meta_info.cc @@ -122,13 +122,6 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc func) { } OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc func) { - PADDLE_ENFORCE_EQ( - index_, - 0UL, - platform::errors::Unimplemented( - "Currently, the InferShapeFn setting of Grad Op is not supported, " - "And backward Tensor `X@GRAD` will use the shape of forward Tensor " - "`X` by default.")); info_ptr_->SetInferShapeFn(std::forward(func)); return *this; } diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 6ecc46ca8b53f..e5dd1ca5f870d 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -58,15 +58,6 @@ limitations under the License. 
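The `op_meta_info.cc` hunk above removes the check that rejected `SetInferShapeFn` on gradient ops, so a backward custom operator may now register its own shape function instead of silently reusing the forward input's shape. A hedged custom-operator sketch of what this permits; the op name, kernel, and shape function are illustrative, and the matching forward `PD_BUILD_OP(custom_relu)` is assumed to exist elsewhere:

```cpp
#include <vector>
#include "paddle/extension.h"

// Illustrative backward kernel for a hypothetical custom_relu op.
std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
                                         const paddle::Tensor& grad_out) {
  return {grad_out};  // placeholder body
}

// Illustrative backward shape function: X@GRAD has the shape of X.
std::vector<std::vector<int64_t>> ReluBackwardInferShape(
    std::vector<int64_t> x_shape, std::vector<int64_t> grad_out_shape) {
  return {x_shape};
}

// Before this patch, the SetInferShapeFn call below raised an Unimplemented error.
PD_BUILD_GRAD_OP(custom_relu)
    .Inputs({"X", paddle::Grad("Out")})
    .Outputs({paddle::Grad("X")})
    .SetKernelFn(PD_KERNEL(ReluBackward))
    .SetInferShapeFn(PD_INFER_SHAPE(ReluBackwardInferShape));
```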
*/ namespace paddle { namespace experimental { -namespace detail { - -inline bool IsDenseTensor( - const std::shared_ptr &tensor_impl) { - return tensor_impl->type_info().name() == "DenseTensor"; -} - -} // namespace detail - // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); @@ -118,7 +109,7 @@ void Tensor::reshape(const std::vector &shape) { "reason: `reshape` means changing the tensor shape without " "touching underlying data, this requires the total size of " "the tensor to remain constant."; - if (detail::IsDenseTensor(impl_)) { + if (is_dense_tensor()) { std::dynamic_pointer_cast(impl_)->set_meta( pten::DenseTensorMeta(dtype(), framework::make_ddim(shape))); } else { @@ -133,6 +124,10 @@ DataType Tensor::type() const { return impl_->dtype(); } DataLayout Tensor::layout() const { return impl_->layout(); } +bool Tensor::is_dense_tensor() const { + return pten::DenseTensor::classof(impl_.get()); +} + /* Part 3: Device and Backend methods */ PlaceType Tensor::place() const { @@ -153,7 +148,7 @@ bool Tensor::is_cuda() const { template T *Tensor::mutable_data() { - if (detail::IsDenseTensor(impl_)) { + if (is_dense_tensor()) { return std::dynamic_pointer_cast(impl_) ->mutable_data(); } @@ -209,7 +204,7 @@ Tensor::mutable_data(const PlaceType &place); template const T *Tensor::data() const { - if (detail::IsDenseTensor(impl_)) { + if (is_dense_tensor()) { return std::dynamic_pointer_cast(impl_)->data(); } return nullptr; @@ -258,11 +253,11 @@ template PADDLE_API paddle::platform::float16 * Tensor::data(); // TODO(chenweihang): replace slice impl by API -Tensor Tensor::slice(const int64_t begin_idx, const int64_t end_idx) const { - if (detail::IsDenseTensor(impl_)) { +Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { + if (is_dense_tensor()) { return Tensor(std::make_shared( std::move(pten::CompatibleDenseTensorUtils::Slice( - std::dynamic_pointer_cast(impl_).get(), + *(std::dynamic_pointer_cast(impl_).get()), begin_idx, end_idx)))); } else { diff --git a/paddle/pten/api/lib/utils/allocator.h b/paddle/pten/api/lib/utils/allocator.h index 8a8569c73edae..4f5a810e400ce 100644 --- a/paddle/pten/api/lib/utils/allocator.h +++ b/paddle/pten/api/lib/utils/allocator.h @@ -28,8 +28,8 @@ class DefaultAllocator : public pten::Allocator { explicit DefaultAllocator(const paddle::platform::Place& place) : place_(place) {} - static void Delete(void* data) { - deleter_(static_cast(data)); + static void Delete(Allocation* allocation) { + deleter_(allocation->CastContextWithoutCheck()); } Allocation Allocate(size_t bytes_size) override { @@ -38,6 +38,8 @@ class DefaultAllocator : public pten::Allocator { return Allocation(ptr, a.release(), &Delete, place_); } + const paddle::platform::Place& place() override { return place_; } + private: paddle::platform::Place place_; static paddle::memory::Allocator::AllocationDeleter deleter_; diff --git a/paddle/pten/api/lib/utils/storage.cc b/paddle/pten/api/lib/utils/storage.cc index ba26e7f600d60..9ee1b9e5b7f92 100644 --- a/paddle/pten/api/lib/utils/storage.cc +++ b/paddle/pten/api/lib/utils/storage.cc @@ -20,13 +20,15 @@ namespace experimental { ExternalStorage::ExternalStorage(void* ptr, size_t size, const paddle::platform::Place& place) - : pten::Storage(pten::Allocation(ptr, place)), size_(size) {} + : pten::Storage( + std::make_shared(ptr, size, place)), + size_(size) {} ExternalStorage::ExternalStorage(const pten::intrusive_ptr& root, size_t delta, size_t size) - : Storage(pten::Allocation(static_cast(root->data()) + delta, - 
root->place())), + : Storage(std::make_shared( + static_cast(root->data()) + delta, size, root->place())), size_(size) { PADDLE_ENFORCE_LE(static_cast(delta + size), root->size(), diff --git a/paddle/pten/api/lib/utils/storage.h b/paddle/pten/api/lib/utils/storage.h index e98c5a82feddd..e102ecbc5de7d 100644 --- a/paddle/pten/api/lib/utils/storage.h +++ b/paddle/pten/api/lib/utils/storage.h @@ -35,13 +35,17 @@ class ExternalStorage : public pten::Storage { } void Clear() override { - data_.Clear(); + data_ = nullptr; size_ = 0; } size_t size() const noexcept override { return size_; } const paddle::platform::Place& place() const override { - return data_.place(); + PADDLE_ENFORCE_NOT_NULL( + data_, + paddle::platform::errors::Unavailable( + "Unable to visit place as data_ has not been initialized yet.")); + return data_->place(); } bool OwnsMemory() const noexcept override { return false; } @@ -52,14 +56,10 @@ class ExternalStorage : public pten::Storage { class SharedStorage : public pten::Storage { public: explicit SharedStorage( - const std::shared_ptr& allocation, - size_t offset) - : allocation_(allocation) { + const std::shared_ptr& allocation) + : Storage(allocation) { CHECK(allocation); - data_ = pten::Allocation( - reinterpret_cast(reinterpret_cast(allocation->ptr()) + - offset), - allocation->place()); + place_ = allocation->place(); size_ = allocation->size(); } @@ -67,61 +67,61 @@ class SharedStorage : public pten::Storage { // system, we need to allow the uninitialized SharedStorage to exist, // and it can be removed after the compatibility phase is over in the future explicit SharedStorage(const paddle::platform::Place& place) { - data_ = pten::Allocation(nullptr, place); + place_ = place; } - static const char* name() { return "SharedStorage"; } - - // In order to be compatible with the original Tensor design and execution - // system, we need to allow the SharedStorage realloc, - // and it can be removed after the compatibility phase is over in the future void Realloc(size_t n) override { - ResetAllocation(paddle::memory::AllocShared(place(), n), 0); + this->Clear(); + data_ = paddle::memory::AllocShared(place(), n); + size_ = n; } + static const char* name() { return "SharedStorage"; } + void Clear() override { - data_.Clear(); + data_ = nullptr; size_ = 0; } - size_t size() const noexcept override { return size_; } + void set_data_shared( + const std::shared_ptr& holder) override { + data_ = holder; + if (holder) { + size_ = holder->size(); + place_ = holder->place(); + } + } + + size_t size() const noexcept override { + return data_ ? data_->size() : size_; + } const paddle::platform::Place& place() const override { - return data_.place(); + return data_ ? 
data_->place() : place_; } bool OwnsMemory() const noexcept override { return false; } const std::shared_ptr& GetAllocation() { - return allocation_; + return data_; } // Temporary method: For compatible with fluid Tensor and improve performance - void ResetAllocation(std::shared_ptr allocation, - size_t offset) { - allocation_ = allocation; - data_ = pten::Allocation( - reinterpret_cast(reinterpret_cast(allocation->ptr()) + - offset), - allocation->place()); + void ResetAllocation(std::shared_ptr allocation) { + data_ = allocation; size_ = allocation->size(); + place_ = allocation->place(); } // Temporary method: For compatible with fluid Tensor and improve performance void ResetAllocationPlace(const paddle::platform::Place& place) { - data_ = pten::Allocation(nullptr, place); + place_ = place; } // Temporary method: For compatible with fluid Tensor and improve performance - void Reset() { - if (allocation_ != nullptr) { - allocation_.reset(); - } - data_.Clear(); - size_ = 0; - } + void Reset() { this->Clear(); } private: + Place place_; int64_t size_{0}; - std::shared_ptr allocation_; }; class TensorStorage : public paddle::memory::allocation::Allocation { diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index b248cd209899b..69a1fc274a28d 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -33,39 +33,33 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { std::unique_ptr MakePtenDenseTensor( const paddle::framework::Tensor& src) { + VLOG(3) << "MakePtenDenseTensor based Tensor."; pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()), src.dims(), - pten::TransToPtenDataLayout(src.layout())}; - auto shared_storage = - pten::make_intrusive(src.Holder(), src.offset()); + src.layout(), + src.offset()}; + auto shared_storage = pten::make_intrusive(src.Holder()); return std::make_unique(std::move(shared_storage), std::move(meta)); } std::unique_ptr MakePtenDenseTensor( const paddle::framework::LoDTensor& src) { - pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()), - src.dims(), - pten::TransToPtenDataLayout(src.layout())}; - SetLoD(&meta.lod, src.lod()); - auto shared_storage = - pten::make_intrusive(src.Holder(), src.offset()); - - return std::make_unique(std::move(shared_storage), - std::move(meta)); + auto out = + MakePtenDenseTensor(static_cast(src)); + SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), + src.lod()); + return std::move(out); } std::unique_ptr MakePtenDenseTensor( - const paddle::framework::Tensor& tensor, - const pten::TensorArgDef& arg_def) { - pten::DenseTensorMeta meta{arg_def.dtype, - tensor.dims(), - pten::TransToPtenDataLayout(tensor.layout())}; - - if (tensor.IsInitialized() && - tensor.place() == pten::TransToFluidPlace(arg_def.backend)) { - auto shared_storage = - pten::make_intrusive(tensor.Holder(), tensor.offset()); + const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def) { + pten::DenseTensorMeta meta{ + arg_def.dtype, src.dims(), src.layout(), src.offset()}; + + if (src.IsInitialized() && + src.place() == pten::TransToFluidPlace(arg_def.backend)) { + auto shared_storage = pten::make_intrusive(src.Holder()); return std::make_unique(std::move(shared_storage), std::move(meta)); } else { @@ -77,25 +71,13 @@ std::unique_ptr MakePtenDenseTensor( } std::unique_ptr MakePtenDenseTensor( - const paddle::framework::LoDTensor& tensor, + const paddle::framework::LoDTensor& src, const pten::TensorArgDef& 
arg_def) { - pten::DenseTensorMeta meta{arg_def.dtype, - tensor.dims(), - pten::TransToPtenDataLayout(tensor.layout()), - pten::TransToPtenLoD(tensor.lod())}; - - if (tensor.IsInitialized() && - tensor.place() == pten::TransToFluidPlace(arg_def.backend)) { - auto shared_storage = - pten::make_intrusive(tensor.Holder(), tensor.offset()); - return std::make_unique(std::move(shared_storage), - std::move(meta)); - } else { - return std::make_unique( - std::move(pten::make_intrusive( - pten::TransToFluidPlace(arg_def.backend))), - std::move(meta)); - } + auto out = MakePtenDenseTensor( + static_cast(src), arg_def); + SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), + src.lod()); + return std::move(out); } pten::Scalar MakePtenScalar(const paddle::framework::LoDTensor& src) { @@ -328,23 +310,15 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { std::shared_ptr holder( new TensorStorage(std::move(storage))); dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); + dst->set_offset(src->meta().offset); } void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { - PADDLE_ENFORCE_NOT_NULL( - src, - platform::errors::InvalidArgument( - "The source DenseTensor is nullptr when move storage.")); - PADDLE_ENFORCE_NOT_NULL( - dst, - platform::errors::InvalidArgument( - "The destination LoDTensor is nullptr when move storage.")); - SetLoD(dst->mutable_lod(), src->lod()); MovesStorage(src, static_cast(dst)); + SetLoD(dst->mutable_lod(), src->lod()); } -void MovesSharedStorage(pten::DenseTensor* src, - paddle::framework::Tensor* dst) { +void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { PADDLE_ENFORCE_NOT_NULL( src, platform::errors::InvalidArgument( @@ -358,24 +332,22 @@ void MovesSharedStorage(pten::DenseTensor* src, pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); dst->ResetHolderWithType(storage->GetAllocation(), pten::TransToProtoVarType(src->dtype())); + dst->set_offset(src->meta().offset); } -void MovesSharedStorage(pten::DenseTensor* src, - paddle::framework::LoDTensor* dst) { - MovesSharedStorage(src, static_cast(dst)); +void SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { + SharesStorage(src, static_cast(dst)); SetLoD(dst->mutable_lod(), src->lod()); } void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, - const pten::TensorArgDef& arg_def, pten::DenseTensor* dst) { + VLOG(3) << "ReMakePtenDenseTensor based Tensor."; auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); meta->dims = src.dims(); - // Since the type of DenseTensorMeta is const, const_cast must be used - const_cast(meta->dtype) = arg_def.dtype; - // Since the type of DenseTensorMeta is const, const_cast must be used - const_cast(meta->layout) = - pten::TransToPtenDataLayout(src.layout()); + meta->dtype = pten::TransToPtenDataType(src.type()); + meta->layout = src.layout(); + meta->offset = src.offset(); auto* shared_storage = static_cast( pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); @@ -384,44 +356,30 @@ void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, platform::errors::NotFound( "Target DenseTensor's shared storage is nullptr.")); - if (src.IsInitialized()) { - shared_storage->ResetAllocation(src.Holder(), src.offset()); - } + PADDLE_ENFORCE_EQ(src.IsInitialized(), + true, + paddle::platform::errors::InvalidArgument( + "Source Tensor is not initialized.")); + shared_storage->ResetAllocation(src.Holder()); } void 
ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, pten::DenseTensor* dst) { auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); - meta->dims = src.dims(); - // Since the type of DenseTensorMeta is const, const_cast must be used - const_cast(meta->dtype) = pten::TransToPtenDataType(src.type()); - // Since the type of DenseTensorMeta is const, const_cast must be used - const_cast(meta->layout) = - pten::TransToPtenDataLayout(src.layout()); - - auto* shared_storage = static_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); - PADDLE_ENFORCE_NOT_NULL( - shared_storage, - platform::errors::NotFound( - "Target DenseTensor's shared storage is nullptr.")); - - if (src.IsInitialized()) { - shared_storage->ResetAllocation(src.Holder(), src.offset()); - } + SetLoD(&meta->lod, src.lod()); + ReMakePtenDenseTensor(static_cast(src), + dst); } -void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, - const pten::TensorArgDef& arg_def, - pten::DenseTensor* dst) { +void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + VLOG(3) << "ReMakePtenDenseTensor based Tensor and TensorArgDef."; auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); meta->dims = src.dims(); - // Since the type of DenseTensorMeta is const, const_cast must be used - const_cast(meta->dtype) = arg_def.dtype; - // Since the type of DenseTensorMeta is const, const_cast must be used - const_cast(meta->layout) = - pten::TransToPtenDataLayout(src.layout()); - SetLoD(&(meta->lod), src.lod()); + meta->dtype = arg_def.dtype; + meta->layout = src.layout(); + meta->offset = src.offset(); auto* shared_storage = static_cast( pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); @@ -429,15 +387,25 @@ void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, shared_storage, platform::errors::NotFound( "Target DenseTensor's shared storage is nullptr.")); + if (src.IsInitialized() && src.place() == pten::TransToFluidPlace(arg_def.backend)) { - shared_storage->ResetAllocation(src.Holder(), src.offset()); + shared_storage->ResetAllocation(src.Holder()); } else { shared_storage->ResetAllocationPlace( pten::TransToFluidPlace(arg_def.backend)); } } +void ReMakePtenDenseTensorByArgDef(const paddle::framework::LoDTensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + SetLoD(&meta->lod, src.lod()); + ReMakePtenDenseTensorByArgDef( + static_cast(src), arg_def, dst); +} + void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, const pten::TensorArgDef& arg_def, pten::DenseTensor* dst) { @@ -453,9 +421,9 @@ void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, if (!platform::is_same_place(tensor.place(), expected_place)) { framework::LoDTensor tmp_tensor; framework::TensorCopySync(tensor, expected_place, &tmp_tensor); - ReMakePtenDenseTensor(tmp_tensor, arg_def, dst); + ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst); } else { - ReMakePtenDenseTensor(tensor, arg_def, dst); + ReMakePtenDenseTensorByArgDef(tensor, arg_def, dst); } } else if (variable.IsType()) { // TODO(chenweihang): now we don't deal with row and height @@ -470,9 +438,9 @@ void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, framework::Tensor tmp_tensor; TensorCopySync(tensor.value(), expected_place, &tmp_tensor); // TODO(chenweihang): adapt SelectedRows by xiaowei's 
design - ReMakePtenDenseTensor(tmp_tensor, arg_def, dst); + ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst); } else { - ReMakePtenDenseTensor(tensor.value(), arg_def, dst); + ReMakePtenDenseTensorByArgDef(tensor.value(), arg_def, dst); } } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -488,12 +456,12 @@ void ReMakePtenDenseTensorFromVar(framework::Variable* variable, // KernelContext to original tensor if (variable->template IsType()) { auto* tensor = variable->template GetMutable(); - ReMakePtenDenseTensor(*tensor, arg_def, dst); + ReMakePtenDenseTensorByArgDef(*tensor, arg_def, dst); } else if (variable->template IsType()) { auto* tensor = variable->template GetMutable(); // TODO(chenweihang): adapt SelectedRows by xiaowei's design, // here the row and height will lost in output! - ReMakePtenDenseTensor(tensor->value(), arg_def, dst); + ReMakePtenDenseTensorByArgDef(tensor->value(), arg_def, dst); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported shared output `%s` type now when call pt kernel.", diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h index 32b7c377ebfde..06edb4a7516b0 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.h +++ b/paddle/pten/api/lib/utils/tensor_utils.h @@ -58,10 +58,9 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); -void MovesSharedStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); +void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); -void MovesSharedStorage(pten::DenseTensor* src, - paddle::framework::LoDTensor* dst); +void SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); /** * In order to improve the compatibility state performance, some tricky tool @@ -72,17 +71,20 @@ void MovesSharedStorage(pten::DenseTensor* src, * the overhead caused by frequent construction and destruction of the * DenseTensor. */ -void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, - pten::DenseTensor* dst); - void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, - const pten::TensorArgDef& arg_def, pten::DenseTensor* dst); void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, - const pten::TensorArgDef& arg_def, pten::DenseTensor* dst); +void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst); + +void ReMakePtenDenseTensorByArgDef(const paddle::framework::LoDTensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst); + void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, const pten::TensorArgDef& arg_def, pten::DenseTensor* dst); diff --git a/paddle/pten/backends/all_context.h b/paddle/pten/backends/all_context.h index a7cb4abc2f242..8cc07d216c02c 100644 --- a/paddle/pten/backends/all_context.h +++ b/paddle/pten/backends/all_context.h @@ -22,7 +22,6 @@ limitations under the License. 
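The `tensor_utils.h` hunk above renames `MovesSharedStorage` to `SharesStorage` and splits the arg-def-aware remake path into `ReMakePtenDenseTensorByArgDef`. A hedged sketch of the compatibility round-trip these helpers support in a kernel adapter; the surrounding objects (`in_def`, the pre-built DenseTensor shells, the fluid output) are assumed to be provided by the calling code:

```cpp
#include "paddle/pten/api/lib/utils/tensor_utils.h"

// Sketch: convert a fluid input into a reusable pten::DenseTensor shell, run
// the pten kernel, then share the output holder back into the fluid tensor.
void AdaptOneArg(const paddle::framework::LoDTensor& fluid_in,
                 const pten::TensorArgDef& in_def,
                 pten::DenseTensor* pt_in,
                 pten::DenseTensor* pt_out,
                 paddle::framework::LoDTensor* fluid_out) {
  // Re-point pt_in's meta and shared storage at fluid_in according to the
  // kernel's argument definition (dtype/backend).
  paddle::experimental::ReMakePtenDenseTensorByArgDef(fluid_in, in_def, pt_in);

  // ... run the pten kernel here, writing into *pt_out ...

  // Share (rather than move) the output allocation and offset back to fluid.
  paddle::experimental::SharesStorage(pt_out, fluid_out);
}
```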
*/ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/backends/npu/npu_context.h" #include "paddle/pten/backends/xpu/xpu_context.h" namespace pten { diff --git a/paddle/pten/common/device.cc b/paddle/pten/common/device.cc index 9583b521d9123..55130067ae200 100644 --- a/paddle/pten/common/device.cc +++ b/paddle/pten/common/device.cc @@ -24,7 +24,7 @@ const char* DeviceTypeStr(DeviceType type) { case DeviceType::kUndef: return "kUndef"; case DeviceType::kHost: - return "kUndef"; + return "kHost"; case DeviceType::kXpu: return "kXpu"; case DeviceType::kCuda: diff --git a/paddle/pten/common/layout.h b/paddle/pten/common/layout.h index b7c151e7e6a7c..cfcc4f76693d9 100644 --- a/paddle/pten/common/layout.h +++ b/paddle/pten/common/layout.h @@ -18,6 +18,8 @@ limitations under the License. */ namespace paddle { namespace experimental { +// Note: Here the DataLayout is public api for external users, the prefix `k` +// maybe confuse users, so we use all uppercase names enum class DataLayout { UNDEFINED = 0, // TODO(chenweihang): keep ANY for compatibility, remove it later @@ -26,28 +28,67 @@ enum class DataLayout { NCHW, MKLDNN, NUM_DATA_LAYOUTS, - // See Note [ Why we need ALL in baisc kernel key member? ] + // See Note [ Why we need ALL in basic kernel key member? ] ALL_LAYOUT = UNDEFINED, + // Note: Unify pten DataLayout and fluid::framework::DataLayout, + // for compatible with fluid DataLayout, here need prefix `k` + // Note: The original `kAnyLayout (enum value 2)` is a strange design. + // `kAnyLayout` originally cannot represent any kind of Layout, + // at the same time, it can also represent any Layout. + // Strictly, it means "default" or "undefined" layout, + // and should not be mixed with other meaningful layouts. 
+ kAnyLayout = ANY, + kNHWC = NHWC, + kNCHW = NCHW, + kMKLDNN = MKLDNN, // all layouts supported by MKLDNN internally }; -inline std::ostream& operator<<(std::ostream& os, DataLayout layout) { +} // namespace experimental + +// In order to be compatible with the fluid implementation +namespace framework { + +using DataLayout = paddle::experimental::DataLayout; + +inline DataLayout StringToDataLayout(const std::string& str) { + std::string s(str); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + + if (s == "NHWC") { + return DataLayout::kNHWC; + } else if (s == "NCHW") { + return DataLayout::kNCHW; + } else if (s == "ANYLAYOUT") { + return DataLayout::kAnyLayout; + } else if (s == "MKLDNNLAYOUT") { + return DataLayout::kMKLDNN; + } else { + PD_THROW("Unknown data layout type string: ", s, "."); + } +} + +inline std::string DataLayoutToString(const DataLayout& layout) { switch (layout) { - case DataLayout::UNDEFINED: - os << "Undefined"; - break; - case DataLayout::NHWC: - os << "NHWC"; - break; - case DataLayout::NCHW: - os << "NCHW"; - break; - case DataLayout::MKLDNN: - os << "MKLDNN"; - break; + case DataLayout::kNHWC: + return "NHWC"; + case DataLayout::kNCHW: + return "NCHW"; + case DataLayout::kAnyLayout: + return "Undefined(AnyLayout)"; + case DataLayout::kMKLDNN: + return "MKLDNN"; default: - PD_THROW( - "Invalid enum data layout type `", static_cast(layout), "`."); + PD_THROW("Unknown Data Layout type ", static_cast(layout), "."); } +} +} // namespace framework + +namespace experimental { + +inline std::ostream& operator<<(std::ostream& os, DataLayout layout) { + os << framework::DataLayoutToString(layout); return os; } diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt index 63cae6cc70867..87c3612e35424 100644 --- a/paddle/pten/core/CMakeLists.txt +++ b/paddle/pten/core/CMakeLists.txt @@ -8,7 +8,12 @@ endif() cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce convert_utils) cc_library(kernel_context SRCS kernel_context.cc DEPS enforce pten_context) - cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce) -cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce) -cc_library(dense_tensor SRCS dense_tensor.cc DEPS tensor_meta tensor_base) + +cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector) +cc_library(dense_tensor SRCS dense_tensor.cc DEPS convert_utils tensor_meta tensor_base) + +# Will remove once we implemented MKLDNN_Tensor +if(WITH_MKLDNN) + add_dependencies(dense_tensor mkldnn) +endif() diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h index 9c6f749609a48..74455be136834 100644 --- a/paddle/pten/core/allocator.h +++ b/paddle/pten/core/allocator.h @@ -55,29 +55,47 @@ class RawAllocator { class Allocation final { public: using Place = paddle::platform::Place; - using DeleterFnPtr = void (*)(void*); + using DeleterFnPtr = void (*)(Allocation*); Allocation() = default; - Allocation(Allocation&&) = default; - Allocation& operator=(Allocation&&) = default; + // Don't own resources, only provide access. Allocation(void* data, const Place& place) : data_(data), place_(place) {} - Allocation(void* data, - void* ctx, - DeleterFnPtr ctx_deleter, - const Place& place) - : data_(data), ctx_(ctx, ctx_deleter), place_(place) {} + // Own resources. 
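// For example (a sketch only; `BackendContext` and its data() are hypothetical,
// not types from this patch), an owning allocation pairs the context with a
// deleter that now receives the Allocation itself:
//
//   auto* ctx = new BackendContext(/*...*/);
//   Allocation holder(ctx->data(), ctx,
//                     [](Allocation* self) {
//                       delete self->CastContextWithoutCheck<BackendContext>();
//                     },
//                     place);
//   // Clear() (or the destructor) invokes the deleter exactly once.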
+ Allocation(void* data, void* ctx, DeleterFnPtr deleter, const Place& place) + : data_(data), ctx_(ctx), deleter_(deleter), place_(place) {} + Allocation(Allocation&& other) { swap(*this, other); } + Allocation& operator=(Allocation&& other) { + // Exchange them explicitly to avoid moving is equivalent + // to copying. + swap(*this, other); + return *this; + } + ~Allocation() { Clear(); } + + void* ptr() const noexcept { return data_; } void* operator->() const noexcept { return data_; } - operator bool() const noexcept { return data_ || ctx_.Get(); } + operator bool() const noexcept { return data_ || ctx_; } const Place& place() const noexcept { return place_; } void Clear() { - ctx_.Clear(); + if (deleter_) { + deleter_(this); + } + ctx_ = nullptr; + deleter_ = nullptr; data_ = nullptr; } + DeleterFnPtr deleter() const noexcept { return deleter_; } + + template + T* CastContextWithoutCheck() const noexcept { + return static_cast(ctx_); + } + /// \brief Statically cast the void pointer of the context object to /// the primitive type. Conversion of any pointer to void* and back /// to pointer to the original cv type preserves its original value. @@ -85,69 +103,43 @@ class Allocation final { /// \param expected_deleter The destructor passed in to enhance type /// safety checking. template - T* CastContext(DeleterFnPtr expected_deleter) const noexcept { - if (ctx_.deleter() != expected_deleter) { - return nullptr; - } - return static_cast(ctx_.Get()); + T* CastContext(DeleterFnPtr expected_deleter) const { + PADDLE_ENFORCE_EQ( + deleter_ == expected_deleter, + true, + paddle::platform::errors::InvalidArgument( + "The deleter of the allocation does not match, so the pointer " + "cannot be safely removed.")); + return CastContextWithoutCheck(); } - public: - class Context { - public: - Context() = default; - Context(void* ctx, DeleterFnPtr deleter) noexcept : ctx_(ctx), - deleter_(deleter) {} - Context(Context&& other) noexcept { - // Exchange them explicitly to avoid moving is equivalent - // to copying. - swap(*this, other); - } - Context& operator=(Context&& other) noexcept { - swap(*this, other); - return *this; - } - ~Context() { Clear(); } - void Clear() { - if (deleter_) { - deleter_(ctx_); - } - ctx_ = nullptr; - deleter_ = nullptr; - } - void* Get() const noexcept { return ctx_; } - DeleterFnPtr deleter() const noexcept { return deleter_; } - void* Release() noexcept { - deleter_ = nullptr; - return ctx_; - } - friend void swap(Context& a, Context& b) noexcept; - - private: - void* ctx_{nullptr}; - DeleterFnPtr deleter_{nullptr}; - }; - private: + friend void swap(Allocation& a, Allocation& b) noexcept; void* data_{nullptr}; - Context ctx_; + void* ctx_{nullptr}; + DeleterFnPtr deleter_{nullptr}; // TODO(Shixiaowei02): Enum needs to be used instead to reduce // the construction overhead by more than 50%. Place place_; }; -inline void swap(Allocation::Context& a, Allocation::Context& b) noexcept { +inline void swap(Allocation& a, Allocation& b) noexcept { + ::std::swap(a.data_, b.data_); ::std::swap(a.ctx_, b.ctx_); ::std::swap(a.deleter_, b.deleter_); + ::std::swap(a.place_, b.place_); } /// \brief Context compatible allocator interface. This allocator is /// mainly used for general data structures such as Tensor. The raw /// allocator is more universal and efficient. 
class Allocator { + using Place = paddle::platform::Place; + public: virtual ~Allocator() = default; virtual Allocation Allocate(size_t bytes_size) = 0; + virtual const Place& place() = 0; }; inline Allocation Allocate(const std::shared_ptr& a, size_t n) { diff --git a/paddle/pten/core/compat_utils.h b/paddle/pten/core/compat_utils.h index c61b96546ec63..0bd82080ddebc 100644 --- a/paddle/pten/core/compat_utils.h +++ b/paddle/pten/core/compat_utils.h @@ -48,16 +48,16 @@ class CompatibleDenseTensorUtils { } } - static DenseTensor Slice(DenseTensor* tensor, + static DenseTensor Slice(const DenseTensor& tensor, int64_t begin_idx, int64_t end_idx) { - size_t bytes = tensor->numel() * SizeOf(tensor->dtype()); - PADDLE_ENFORCE_GE(tensor->capacity(), + size_t bytes = tensor.numel() * SizeOf(tensor.dtype()); + PADDLE_ENFORCE_GE(tensor.capacity(), bytes, paddle::platform::errors::InvalidArgument( "The memory size %d should be enough to meet the " "volume required by metadata %d.", - tensor->capacity(), + tensor.capacity(), bytes)); PADDLE_ENFORCE_GE(begin_idx, 0, @@ -66,7 +66,7 @@ class CompatibleDenseTensorUtils { "But received the start index is d%.", begin_idx)); PADDLE_ENFORCE_LE(end_idx, - tensor->dims()[0], + tensor.dims()[0], paddle::platform::errors::OutOfRange( "The end row index is out of bound.")); PADDLE_ENFORCE_LT( @@ -77,13 +77,12 @@ class CompatibleDenseTensorUtils { "But received the start index = %d, the end index = %d.", begin_idx, end_idx)); - DenseTensor ret = - DenseTensor(copy_intrusive(tensor->storage_), tensor->meta_); - if (tensor->dims()[0] != 1) { + DenseTensor ret(tensor); + if (tensor.dims()[0] != 1) { ret.meta_.dims[0] = end_idx - begin_idx; - ret.meta_.offset = tensor->meta_.offset + - begin_idx * (tensor->numel() / tensor->dims()[0]) * - paddle::experimental::SizeOf(tensor->dtype()); + ret.meta_.offset = tensor.meta_.offset + + begin_idx * (tensor.numel() / tensor.dims()[0]) * + paddle::experimental::SizeOf(tensor.dtype()); } return ret; } diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc index bb8b41612868d..70184e31f7db6 100644 --- a/paddle/pten/core/convert_utils.cc +++ b/paddle/pten/core/convert_utils.cc @@ -63,21 +63,6 @@ paddle::experimental::DataType TransToPtenDataType( } } -DataLayout TransToPtenDataLayout(const paddle::framework::DataLayout& layout) { - switch (layout) { - case paddle::framework::DataLayout::kNHWC: - return DataLayout::NHWC; - case paddle::framework::DataLayout::kNCHW: - return DataLayout::NCHW; - case paddle::framework::DataLayout::kAnyLayout: - return DataLayout::ANY; - case paddle::framework::DataLayout::kMKLDNN: - return DataLayout::MKLDNN; - default: - return DataLayout::UNDEFINED; - } -} - paddle::platform::Place TransToFluidPlace(const Backend& backend) { // TODO(chenweihang): add other trans cases later switch (backend) { @@ -141,24 +126,6 @@ paddle::framework::proto::VarType::Type TransToProtoVarType( } } -paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout) { - switch (layout) { - case DataLayout::NHWC: - return paddle::framework::DataLayout::kNHWC; - case DataLayout::NCHW: - return paddle::framework::DataLayout::kNCHW; - case DataLayout::ANY: - return paddle::framework::DataLayout::kAnyLayout; - case DataLayout::MKLDNN: - return paddle::framework::DataLayout::kMKLDNN; - default: - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported data layout `%s` when casting it into " - "paddle data layout.", - layout)); - } -} - paddle::framework::LoD 
TransToFluidLoD(const pten::LoD& lod) { paddle::framework::LoD out; out.reserve(lod.size()); diff --git a/paddle/pten/core/convert_utils.h b/paddle/pten/core/convert_utils.h index 49c905a84ed48..9e33d37c4a8e8 100644 --- a/paddle/pten/core/convert_utils.h +++ b/paddle/pten/core/convert_utils.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/pten/core/tensor_meta.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/place.h" @@ -37,12 +36,10 @@ const std::string& TransToPtenKernelName(const std::string& fluid_op_name); Backend TransToPtenBackend(const paddle::platform::Place& place); DataType TransToPtenDataType( const paddle::framework::proto::VarType::Type& dtype); -DataLayout TransToPtenDataLayout(const paddle::framework::DataLayout& layout); paddle::platform::Place TransToFluidPlace(const Backend& backend); paddle::framework::proto::VarType::Type TransToProtoVarType( const DataType& dtype); -paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout); paddle::framework::LoD TransToFluidLoD(const pten::LoD& lod); pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod); diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 3237576cb6437..1b4254ad2c103 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/core/convert_utils.h" + namespace pten { DenseTensor::DenseTensor(const std::shared_ptr& a, @@ -38,6 +41,35 @@ DenseTensor::DenseTensor(intrusive_ptr storage, DenseTensor::DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta) : meta_(std::move(meta)), storage_(std::move(storage)) {} +DenseTensor::DenseTensor(const DenseTensor& other) : meta_(other.meta()) { + if (storage_ == nullptr) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + } + if (other.storage_ != nullptr && other.storage_->data_shared()) { + storage_->set_data_shared(other.storage_->data_shared()); + } + +#ifdef PADDLE_WITH_MKLDNN + format_ = other.format_; +#endif +} + +DenseTensor& DenseTensor::operator=(const DenseTensor& other) { + meta_ = other.meta(); + if (storage_ == nullptr) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + } + if (other.storage_ != nullptr && other.storage_->data_shared()) { + storage_->set_data_shared(other.storage_->data_shared()); + } +#ifdef PADDLE_WITH_MKLDNN + format_ = other.format_; +#endif + return *this; +} + int64_t DenseTensor::numel() const { if (meta_.is_scalar) { return 1; @@ -69,12 +101,14 @@ void* DenseTensor::mutable_data(size_t request_bytes) { bytes)); bytes = request_bytes; } - if (storage_->size() < bytes || storage_->size() == 0) { + if (storage_->size() < bytes + meta_.offset || storage_->size() == 0) { VLOG(10) << "mutbale data realloc, original size: " << storage_->size() << ", new size: " << bytes; storage_->Realloc(bytes); + meta_.offset = 0; } - return storage_->data(); + return reinterpret_cast(reinterpret_cast(storage_->data()) + + meta_.offset); } template @@ -100,6 +134,7 @@ T* DenseTensor::mutable_data() { template const T* DenseTensor::data() const { + check_memory_size(); PADDLE_ENFORCE( (dtype() == 
paddle::experimental::CppTypeToDataType::Type()), paddle::platform::errors::InvalidArgument( @@ -108,12 +143,37 @@ const T* DenseTensor::data() const { return static_cast(data()); } +template +T* DenseTensor::data() { + check_memory_size(); + PADDLE_ENFORCE( + (dtype() == paddle::experimental::CppTypeToDataType::Type()), + paddle::platform::errors::InvalidArgument( + "The type of data we are trying to retrieve does not match the " + "type of data currently contained in the container.")); + PADDLE_ENFORCE_NOT_NULL( + storage_, + paddle::platform::errors::PreconditionNotMet( + "The storage must be valid when call the mutable data function.")); + return reinterpret_cast(data()); +} + +void* DenseTensor::data() { + PADDLE_ENFORCE_NOT_NULL( + storage_, + paddle::platform::errors::PreconditionNotMet( + "The storage must be valid when call the mutable data function.")); + return reinterpret_cast(reinterpret_cast(storage_->data()) + + meta_.offset); +} + const void* DenseTensor::data() const { PADDLE_ENFORCE_NOT_NULL( storage_, paddle::platform::errors::PreconditionNotMet( "The storage must be valid when call the mutable data function.")); - return storage_->data(); + return reinterpret_cast( + reinterpret_cast(storage_->data()) + meta_.offset); } void DenseTensor::set_meta(DenseTensorMeta&& meta) { @@ -124,16 +184,29 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) { meta_ = std::move(meta); } +/* @jim19930609: This interface will be further modified util we finalized the + design for Allocator - Allocation + For now, we have to temporarily accommodate two independent use cases: + 1. Designed behaviour: DenseTensor constructed with its underlying storage_ + initialized + 2. Legacy behaviour(fluid): DenseTensor constructed using default + constructor, where + storage_ won't be initialized until the first + call to mutable_data(place) + */ void DenseTensor::Resize(const DDim& dims) { meta_.dims = dims; - mutable_data(); + if (storage_ != nullptr) { + mutable_data(); + } } void DenseTensor::ResetLoD(const LoD& lod) { meta_.lod = lod; } -#define DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ - template dtype* DenseTensor::mutable_data(); \ - template const dtype* DenseTensor::data() const; +#define DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ + template dtype* DenseTensor::mutable_data(); \ + template const dtype* DenseTensor::data() const; \ + template dtype* DenseTensor::data(); DATA_MEMBER_FUNC_INSTANTIATION(bool); DATA_MEMBER_FUNC_INSTANTIATION(int8_t); @@ -153,4 +226,286 @@ DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128); #undef DATA_MEMBER_FUNC_INSTANTIATION +/* --------------------------- */ +/* From framework::Tensor */ +/* --------------------------- */ +DenseTensor::DenseTensor() { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + inplace_version_counter_ = std::make_shared(0); + meta_.dtype = paddle::experimental::DataType::FLOAT32; + meta_.offset = 0; +} + +DenseTensor::DenseTensor(const paddle::framework::proto::VarType::Type& dtype) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + inplace_version_counter_ = std::make_shared(0); + meta_.dtype = TransToPtenDataType(dtype); + meta_.offset = 0; +} + +size_t DenseTensor::memory_size() const { + if (storage_ == nullptr || storage_->data_shared() == nullptr) { + return 0UL; + } + + return storage_->data_shared()->size() - meta_.offset; +} + +void DenseTensor::check_memory_size() const { + PADDLE_ENFORCE_NOT_NULL(storage_, + paddle::platform::errors::PreconditionNotMet( + "Tensor holds no memory. 
" + "Call Tensor::mutable_data firstly.")); + PADDLE_ENFORCE_NOT_NULL(storage_->data_shared(), + paddle::platform::errors::PreconditionNotMet( + "Tensor holds no memory. " + "Call Tensor::mutable_data firstly.")); + size_t size = numel() * SizeOf(dtype()); + + PADDLE_ENFORCE_LE( + size, + memory_size(), + paddle::platform::errors::PreconditionNotMet( + "Tensor's dimension is out of bound." + "Tensor's dimension must be equal or less than the size of its " + "memory." + "But received Tensor's dimension is d%, memory's size is %d.", + size, + memory_size())); +} + +const paddle::platform::Place& DenseTensor::place() const { + PADDLE_ENFORCE_NOT_NULL( + storage_, + paddle::platform::errors::PreconditionNotMet( + "Tensor not initialized yet when Tensor::place() is called.")); + return storage_->place(); +} + +paddle::framework::proto::VarType::Type DenseTensor::type() const { + PADDLE_ENFORCE_NOT_NULL( + storage_, + paddle::platform::errors::PreconditionNotMet( + "Tensor not initialized yet when Tensor::type() is called.")); + return TransToProtoVarType(meta_.dtype); +} + +paddle::framework::proto::VarType::Type DenseTensor::saved_type() const { + return TransToProtoVarType(meta_.dtype); +} + +void DenseTensor::set_layout(const paddle::framework::DataLayout layout) { + meta_.layout = layout; +} + +void DenseTensor::ResetHolder( + const std::shared_ptr& holder) { + PADDLE_ENFORCE_EQ( + meta_.offset, + 0, + paddle::platform::errors::Fatal( + "Only the offset is supported to zero when the holder is reset.")); + + PADDLE_ENFORCE_NOT_NULL( + storage_, + paddle::platform::errors::PreconditionNotMet( + "The storage must be valid when call the mutable data function.")); + + if (storage_->data_shared()) { + PADDLE_ENFORCE_LE( + numel() * SizeOf(dtype()) + meta_.offset, + holder->size(), + paddle::platform::errors::InvalidArgument( + "The size of Holder is not enough to store the Tensor.")); + } + + storage_->set_data_shared(holder); +} + +void DenseTensor::ResetHolderWithType( + const std::shared_ptr& holder, + const paddle::framework::proto::VarType::Type& type) { + set_type(type); + ResetHolder(holder); +} + +void DenseTensor::set_type( + const paddle::framework::proto::VarType::Type& type) { + meta_.dtype = TransToPtenDataType(type); +} + +void* DenseTensor::mutable_data(const paddle::platform::Place& place, + paddle::framework::proto::VarType::Type type, + size_t requested_size) { + set_type(type); + PADDLE_ENFORCE_GE( + numel(), + 0, + paddle::platform::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. 
" + "The Tensor's shape is [", + dims(), + "] now")); + size_t size = numel() * SizeOf(dtype()); + if (requested_size && (requested_size > size)) { + size = requested_size; + } + + if (storage_ == nullptr) { + storage_ = make_intrusive(place); + } + + /* some versions of boost::variant don't have operator!= */ + if (storage_->data_shared() == nullptr || + !(storage_->data_shared()->place() == place) || + storage_->data_shared()->size() < size + meta_.offset) { + storage_->Clear(); + storage_->set_data_shared(paddle::memory::AllocShared(place, size)); + meta_.offset = 0; + } + return reinterpret_cast(reinterpret_cast(storage_->data()) + + meta_.offset); +} + +void* DenseTensor::mutable_data(const paddle::platform::Place& place, + size_t requested_size) { + return mutable_data(place, type(), requested_size); +} + +void* DenseTensor::mutable_data(const paddle::platform::Place& place, + paddle::framework::proto::VarType::Type type, + const paddle::platform::Stream& stream) { + set_type(type); + PADDLE_ENFORCE_GE( + numel(), + 0, + paddle::platform::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. " + "The Tensor's shape is [", + dims(), + "] now")); + size_t size = numel() * SizeOf(dtype()); + + if (storage_ == nullptr) { + storage_ = make_intrusive(place); + } + + /* some versions of boost::variant don't have operator!= */ + if (storage_->data_shared() == nullptr || + !(storage_->data_shared()->place() == place) || + storage_->data_shared()->size() < size + meta_.offset || + !(paddle::platform::is_gpu_place(place) && + paddle::memory::InSameStream(storage_->data_shared(), stream))) { + storage_->Clear(); + storage_->set_data_shared(paddle::memory::AllocShared(place, size, stream)); + meta_.offset = 0; + } + return reinterpret_cast(reinterpret_cast(storage_->data()) + + meta_.offset); +} + +/* @jim19930609: The following "mutable_data" only supports specific dtypes + defined in OpProto. This part need another clean up once the data type across + Fluid + and Pten get unified. 
+ */ +template +inline T* DenseTensor::mutable_data(const DDim& dims, + const paddle::platform::Place& place, + size_t requested_size) { + static_assert(std::is_pod::value, "T must be POD"); + meta_.dims = dims; + return mutable_data(place, requested_size); +} + +template +inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, + size_t requested_size) { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data( + place, paddle::framework::DataTypeTrait::DataType(), requested_size)); +} + +void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { + if (storage_ != nullptr && tensor.storage_ != nullptr) { + storage_->set_data_shared(tensor.storage_->data_shared()); + } + meta_.offset = tensor.meta().offset; +} + +#define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ + template dtype* DenseTensor::mutable_data( \ + const DDim& dims, \ + const paddle::platform::Place& place, \ + size_t requested_size); \ + template dtype* DenseTensor::mutable_data( \ + const paddle::platform::Place& place, size_t requested_size); + +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(bool) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int8_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(uint8_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int16_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int64_t) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(float) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(double) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128) + +#undef LEGACY_DATA_MEMBER_FUNC_INSTANTIATION + +/* ------------------------------ */ +/* From framework::LoDTensor */ +/* ------------------------------ */ + +DenseTensor::DenseTensor(const LoD& lod) : DenseTensor() { meta_.lod = lod; } + +void DenseTensor::set_lod(const LoD& lod) { meta_.lod = lod; } + +LoD* DenseTensor::mutable_lod() { return &meta_.lod; } + +std::pair DenseTensor::lod_element(size_t level, + size_t elem) const { + PADDLE_ENFORCE_LT( + level, + NumLevels(), + paddle::platform::errors::InvalidArgument( + "The input level of LoD is invalid, it should be less than LoD " + "size. The input level is %zu, the LoD size is %zu.", + level, + NumLevels())); + + PADDLE_ENFORCE_LT(elem, + NumElements(level), + paddle::platform::errors::InvalidArgument( + "The input element of LoD is invalid, it should be " + "less than the number of elements in its level." + "The input element is %zu, the number of elements in " + "its level is %zu.", + elem, + NumElements(level))); + + return std::make_pair((meta_.lod)[level][elem], (meta_.lod)[level][elem + 1]); +} + +size_t DenseTensor::NumLevels() const { return meta_.lod.size(); } + +size_t DenseTensor::NumElements(size_t level) const { + PADDLE_ENFORCE_LT( + level, + NumLevels(), + paddle::platform::errors::InvalidArgument( + "The input level of LoD is invalid, it should be less than LoD " + "size. The input level is %zu, the LoD size is %zu.", + level, + NumLevels())); + + // the last offset is the end of last element + return (meta_.lod)[level].size() - 1; +} + } // namespace pten diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 92c8e3d4bdbdf..fc92e84f52cea 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -14,15 +14,44 @@ limitations under the License. 
*/ #pragma once +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/stream/stream.h" + #include "paddle/pten/core/allocator.h" #include "paddle/pten/core/storage.h" #include "paddle/pten/core/tensor_base.h" #include "paddle/pten/core/tensor_meta.h" +/* @jim19930609: Move to MKLDNN_Tensor in the future + */ +#ifdef PADDLE_WITH_MKLDNN +#include "dnnl.hpp" +#endif + namespace pten { class CompatibleDenseTensorUtils; +/* --------------------------- */ +/* From framework::Tensor */ +/* --------------------------- */ +/* Temporarily put TensorInplaceVersion inside DenseTensor. + Will move to AutogradMeta as soon as we switch to Eager Dygraph. + */ +class TensorInplaceVersion { + public: + explicit TensorInplaceVersion(uint32_t inplace_version = 0) + : inplace_version_(inplace_version) {} + bool IsUnique() const { return inplace_version_ == 0; } + void Bump() { ++inplace_version_; } + uint32_t CurrentVersion() const { return inplace_version_; } + void SetInplaceVersionToZero() { inplace_version_ = 0; } + + private: + uint32_t inplace_version_; +}; + /// \brief The Dense tensor store values in a contiguous sequential block /// of memory where all values are represented. Tensors or multi-dimensional /// arrays are used in math operators. @@ -56,15 +85,17 @@ class DenseTensor : public TensorBase, /// \brief Because dense tensor is a kind of container, we give a default /// constructor to use for stl container. But the dense tensor created with /// the default constructor is not practical. - DenseTensor() = default; + // DenseTensor() = default; /// \brief Because dense tensor is a resource handle, we provide a default /// move constructor to support move semantics. DenseTensor(DenseTensor&& other) = default; - /// \brief We do not recommend deep copy of dense tensor because of its - /// efficiency and complexity across devices. The operation is disabled here. - DenseTensor(const DenseTensor& other) = delete; + /// \brief DenseTensor shallow copy constructor. + DenseTensor(const DenseTensor& other); + + /// \brief DenseTensor shallow copy assignment. + DenseTensor& operator=(const DenseTensor& other); /// \brief Destroy the tensor object and release exclusive resources. virtual ~DenseTensor() = default; @@ -84,9 +115,7 @@ class DenseTensor : public TensorBase, /// \brief Returns the lod of the tensor. /// \return The lod of the tensor. - const std::vector>& lod() const noexcept { - return meta_.lod; - } + const LoD& lod() const noexcept { return meta_.lod; } /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. @@ -98,7 +127,7 @@ class DenseTensor : public TensorBase, /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. - const Place& place() const override { return storage_->place(); } + const Place& place() const override; /// \brief Returns the meta information of the tensor. /// \return The meta information of the tensor. @@ -127,6 +156,7 @@ class DenseTensor : public TensorBase, /// larger than the original value, the storage area will be reallocated. /// \param dims The new dims of the dense tensor. /// \param lod The new lod of the dense tensor. + // void Resize(const DDim& dims); void Resize(const DDim& dims); /// \brief Change the lod information in the metadata. 
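To make the fluid-compatible surface above concrete, the following is a minimal usage sketch, not a test shipped with this patch. It only touches members introduced here (the default constructor, Resize, the place-based mutable_data, IsInitialized); the include paths are assumptions.

```cpp
// Minimal sketch (assumed includes; illustrative only) of the fluid-style
// members this change grafts onto pten::DenseTensor.
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/core/dense_tensor.h"

void LegacyStyleFill() {
  pten::DenseTensor t;  // default constructor: FLOAT32 meta, CPU-backed storage
  t.Resize(paddle::framework::make_ddim({2, 3}));
  // Legacy allocation path: place-driven; a realloc resets meta_.offset to 0.
  float* data = t.mutable_data<float>(paddle::platform::CPUPlace());
  for (int64_t i = 0; i < t.numel(); ++i) {
    data[i] = static_cast<float>(i);
  }
  // With a holder attached, IsInitialized() reports true.
  (void)t.IsInitialized();
}
```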
@@ -174,9 +204,170 @@ class DenseTensor : public TensorBase, private: friend class CompatibleDenseTensorUtils; - private: + protected: DenseTensorMeta meta_; intrusive_ptr storage_; + + /* --------------------------- */ + /* From framework::Tensor */ + /* --------------------------- */ + /* The following members & interfaces were copied from framework::Tensor, + so as to facilitate the unification of different Tensors + + Will be adjusted/removed/moved in the near future + */ + public: + /* @jim19930609: The way default constructor handles allocator might change, + according to + the final design of Allocation - Allocator. + */ + DenseTensor(); + + /* @jim19930609: Remove dependency on protobuf after Tensor Unification. + */ + explicit DenseTensor(const paddle::framework::proto::VarType::Type& dtype); + + inline bool IsInitialized() const { + return storage_ != nullptr && storage_->data_shared() != nullptr; + } + + template + T* data(); + + void* data(); + + template + T* mutable_data(const paddle::platform::Place& place, + size_t requested_size = 0); + + template + T* mutable_data(const DDim& dims, + const paddle::platform::Place& place, + size_t requested_size = 0); + + void* mutable_data(const paddle::platform::Place& place, + paddle::framework::proto::VarType::Type type, + size_t requested_size = 0); + + void* mutable_data(const paddle::platform::Place& place, + size_t requested_size = 0); + + void* mutable_data(const paddle::platform::Place& place, + paddle::framework::proto::VarType::Type type, + const paddle::platform::Stream& stream); + + /* @jim19930609: Remove dependency on protobuf after Tensor Unification. + */ + paddle::framework::proto::VarType::Type type() const; + + /* @jim19930609: Remove dependency on protobuf after Tensor Unification. + */ + paddle::framework::proto::VarType::Type saved_type() const; + + // memory size returns the holding memory size in byte. + size_t memory_size() const; + + void check_memory_size() const; + + void set_layout(const paddle::framework::DataLayout layout); + + void clear() { + storage_.reset(); + meta_.offset = 0; + } + + void ShareBufferWith(const DenseTensor& tensor); + + void ShareDataTypeWith(const DenseTensor& tensor) { + meta_.dtype = tensor.meta().dtype; + } + + bool IsSharedBufferWith(const DenseTensor& src) const { + if (storage_ == nullptr || src.storage_ == nullptr) return false; + if (storage_->data_shared() == src.storage_->data_shared()) return true; + + return false; + } + + const std::shared_ptr Holder() const { + return storage_ == nullptr ? nullptr : std::move(storage_->data_shared()); + } + + void set_offset(size_t offset) { meta_.offset = offset; } + size_t offset() const { return meta_.offset; } + + std::shared_ptr MoveMemoryHolder() { + return storage_ == nullptr ? nullptr + : std::move(storage_->move_data_shared()); + } + + void ResetHolder(const std::shared_ptr& holder); + + void ResetHolderWithType( + const std::shared_ptr& holder, + const paddle::framework::proto::VarType::Type& type); + + void set_type(const paddle::framework::proto::VarType::Type& type); + + TensorInplaceVersion& InplaceVersionCounter() { + return *inplace_version_counter_; + } + + protected: + std::shared_ptr inplace_version_counter_; + +/* @jim19930609: This is a hack + In general, it is badly designed to fuse MKLDNN-specific objects into a + generic Tensor. + We temporarily leave them here to unblock Tensor Unification progress. + In the final state, we should come up with a MKLDNN_Tensor and move the + following codes there. 
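   Until then, callers keep using the grafted accessors directly, e.g.
   (illustrative only, not code from this patch):

     dense_tensor.set_layout(paddle::framework::DataLayout::kMKLDNN);
     dense_tensor.set_format(dnnl::memory::format_tag::nChw16c);

   i.e. the generic layout flag stays kMKLDNN while the concrete MKLDNN
   memory format is kept in format_ below.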
+ */ +#ifdef PADDLE_WITH_MKLDNN + + public: + inline dnnl::memory::format_tag format() const { return format_; } + + inline void set_format(const dnnl::memory::format_tag format) { + format_ = format; + } + + protected: + /** + * @brief the detail format of memory block which have layout as kMKLDNN + * + * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. + */ + + dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef; +#endif + + /* ------------------------------ */ + /* From framework::LoDTensor */ + /* ------------------------------ */ + /* The following members & interfaces were copied from framework::Tensor, + so as to facilitate the unification of different Tensors + + Will be adjusted/removed/moved in the near future + */ + public: + explicit DenseTensor(const LoD& lod); + + void set_lod(const LoD& lod); + + LoD* mutable_lod(); + + /* + * Get the start offset and end offset of an element from LoD. + */ + std::pair lod_element(size_t level, size_t elem) const; + + size_t NumLevels() const; + + size_t NumElements(size_t level = 0) const; }; } // namespace pten diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h index 3b8347dec772e..56f7eea7ea802 100644 --- a/paddle/pten/core/kernel_alias_name.h +++ b/paddle/pten/core/kernel_alias_name.h @@ -27,12 +27,14 @@ const std::unordered_map kernel_alias_name_map = { {"fill_any_like", "full_like"}, {"fill_constant", "full"}, {"flatten_contiguous_range", "flatten"}, + {"flatten_contiguous_range_grad", "flatten_grad"}, {"matmul_v2", "matmul"}, {"reduce_mean", "mean"}, {"reduce_sum", "sum"}, {"reshape2", "reshape"}, // fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated {"flatten", "deprecated"}, + {"flatten_grad", "deprecated"}, {"matmul", "deprecated"}, {"mean", "deprecated"}, {"reshape", "deprecated"}, diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index a33b13dac2397..bd4687c6e7f4e 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -749,6 +749,8 @@ struct KernelRegistrar { * layout, so the layout also need to be a part of symbol var name. If developer * register 2 kernel with same name, backend, layout and diff dtype, he should * use another register marco PT_REGISTER_KERNEL. + * + * TODO(chenweihang): remove this marco later */ #define PT_REGISTER_NO_TEMPLATE_KERNEL( \ kernel_name, backend, layout, kernel_fn, dtype) \ @@ -772,6 +774,60 @@ struct KernelRegistrar { void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ ::pten::Kernel* kernel) +/** PT_REGISTER_GENERAL_KERNEL + * + * Basic Kernel register marco, used to register a instantiated kernel function + * with one template argument. 
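 *
 * A hypothetical usage sketch (the kernel, context and dtype tokens below are
 * placeholders, not registrations taken from this patch):
 *
 *   PT_REGISTER_GENERAL_KERNEL(
 *       copy, CPU, ALL_LAYOUT, pten::Copy<pten::CPUContext>, ALL_DTYPE) {}
 *
 * The trailing braces supply the args-def body that the macro expansion
 * leaves open.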
+ */ + +#define PT_REGISTER_GENERAL_KERNEL( \ + kernel_name, backend, layout, kernel_fn, dtype) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PT_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ + _PT_REGISTER_GENERAL_KERNEL(kernel_name, backend, layout, kernel_fn, dtype) + +#ifndef _WIN32 +#define _PT_REGISTER_GENERAL_KERNEL( \ + kernel_name, backend, layout, kernel_fn, dtype) \ + template decltype(kernel_fn) kernel_fn; \ + static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + ::pten::Kernel*); \ + static const ::pten::KernelRegistrar \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + #kernel_name, \ + BACKEND(backend), \ + DATALAYOUT(layout), \ + ::pten::KernelArgsParseFunctor::Parse, \ + &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ + PT_KERNEL(kernel_fn), \ + PT_VARIADIC_KERNEL(kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { \ + return 0; \ + } \ + void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + ::pten::Kernel* kernel) +#else +#define _PT_REGISTER_GENERAL_KERNEL( \ + kernel_name, backend, layout, kernel_fn, dtype) \ + static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + ::pten::Kernel*); \ + static const ::pten::KernelRegistrar \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + #kernel_name, \ + BACKEND(backend), \ + DATALAYOUT(layout), \ + ::pten::KernelArgsParseFunctor::Parse, \ + &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ + PT_KERNEL(kernel_fn), \ + PT_VARIADIC_KERNEL(kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { \ + return 0; \ + } \ + void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ + ::pten::Kernel* kernel) +#endif + /** PT_REGISTER_CTX_KERNEL * * Used for kernel registration with device context and data type as diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h index 7a7ae283304bf..5087d912ed525 100644 --- a/paddle/pten/core/kernel_utils.h +++ b/paddle/pten/core/kernel_utils.h @@ -183,9 +183,6 @@ struct KernelImpl { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); #endif -#ifdef PADDLE_WITH_ASCEND_CL - PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(NPUContext); -#endif #ifdef PADDLE_WITH_XPU PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext); #endif diff --git a/paddle/pten/core/storage.cc b/paddle/pten/core/storage.cc index 5cac122b7dee6..f7c7f68734101 100644 --- a/paddle/pten/core/storage.cc +++ b/paddle/pten/core/storage.cc @@ -17,8 +17,8 @@ limitations under the License. */ namespace pten { void TensorStorage::Realloc(size_t size) { - data_.Clear(); - data_ = Allocate(alloc_, size); + this->Clear(); + data_ = paddle::memory::AllocShared(alloc_->place(), size); size_ = size; } diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h index ef9e22a0804e7..fc56935eeaf19 100644 --- a/paddle/pten/core/storage.h +++ b/paddle/pten/core/storage.h @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/pten/core/utils/intrusive_ref_counter.h" #include "paddle/pten/core/utils/type_info.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/place.h" #include "paddle/pten/core/allocator.h" @@ -35,14 +36,46 @@ class Storage : public intrusive_ref_counter { Storage() = default; Storage(const Storage&) = delete; - explicit Storage(Allocation&& data) : data_(std::move(data)) {} + /* @jim19930609: Following interfaces will be modified/replaced/removed + as soon as the new Allocation - Allocator design get + finalized. + */ - virtual ~Storage() = default; + /* --------- shared_ptr -------- */ + // Initialize a Storage with unique Allocation + explicit Storage(std::shared_ptr&& data) + : data_(std::move(data)) {} + + // Initialize a Storage shareing Allocation with another storage + explicit Storage(const std::shared_ptr& data) + : data_(data) {} + + void* data() const { + return data_ ? reinterpret_cast( + reinterpret_cast(data_->ptr())) + : nullptr; + } - /// \brief Get the mutable data pointer of the storage. - /// This function is set to inline to improve performance. - /// \return The mutable data pointer of the storage. - void* data() const noexcept { return data_.operator->(); } + const std::shared_ptr data_shared() const { + return data_; + } + + virtual void set_data_shared( + const std::shared_ptr& holder) { + data_ = holder; + } + + std::shared_ptr move_data_shared() { + return std::move(data_); + } + + virtual void ReallocShared(size_t n) { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "ReallocShared has not been overrided by the current Storage")); + } + /* --------- shared_ptr -------- */ + + virtual ~Storage() = default; virtual void Clear() = 0; @@ -52,7 +85,7 @@ class Storage : public intrusive_ref_counter { virtual void Realloc(size_t n) = 0; protected: - Allocation data_; + std::shared_ptr data_; }; class TensorStorage : public Storage { @@ -60,23 +93,37 @@ class TensorStorage : public Storage { using Place = paddle::platform::Place; explicit TensorStorage(const std::shared_ptr& a) : alloc_(a) {} + TensorStorage(const std::shared_ptr& a, size_t size) - : Storage(Allocate(a, size)), alloc_(a), size_(size) {} + : Storage(paddle::memory::AllocShared(a->place(), size)), alloc_(a) { + size_ = data_->size(); + } + + void Clear() override { + data_ = nullptr; + size_ = 0; + } + + void Realloc(size_t size) override; ~TensorStorage() = default; static const char* name() { return "TensorStorage"; } - void Realloc(size_t size) override; - size_t size() const noexcept override { return size_; } - void Clear() override { - data_.Clear(); - size_ = 0; + const Place& place() const override { + if (!data_ && !alloc_) { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unable to visit place: either data_ or alloc_ has to be initialized " + "first.")); + } + if (data_) { + return data_->place(); + } + return alloc_->place(); } - const Place& place() const override { return data_.place(); } bool OwnsMemory() const noexcept override { return true; } const std::shared_ptr& allocator() const noexcept { return alloc_; diff --git a/paddle/pten/core/tensor_meta.cc b/paddle/pten/core/tensor_meta.cc index 3e06508be69d6..844387bec5c58 100644 --- a/paddle/pten/core/tensor_meta.cc +++ b/paddle/pten/core/tensor_meta.cc @@ -21,14 +21,16 @@ DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims) DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims, - DataLayout layout) - : dims(dims), dtype(dtype), layout(layout) {} + 
DataLayout layout, + size_t offset) + : dims(dims), dtype(dtype), layout(layout), offset(offset) {} DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout, - const std::vector>& lod) - : dims(dims), dtype(dtype), layout(layout), lod(lod) {} + const LoD& lod, + size_t offset) + : dims(dims), dtype(dtype), layout(layout), lod(lod), offset(offset) {} bool DenseTensorMeta::valid() const noexcept { bool valid{true}; @@ -38,10 +40,4 @@ bool DenseTensorMeta::valid() const noexcept { return valid; } -bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) { - bool ret = true; - return ret && (lhs.is_scalar == rhs.is_scalar) && (lhs.dims == rhs.dims) && - (lhs.dtype == rhs.dtype) && (lhs.layout == rhs.layout) && - (lhs.lod == rhs.lod) && (lhs.offset == rhs.offset); -} } // namespace pten diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h index cc02c57a48ba1..2df6b48b674a7 100644 --- a/paddle/pten/core/tensor_meta.h +++ b/paddle/pten/core/tensor_meta.h @@ -22,15 +22,16 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/framework/ddim.h" + // Note: mixed_vector include many header now, LoD will be // used on CUDA device? Can we use small_vector here? -// #include "paddle/fluid/framework/mixed_vector.h" +// @zhanlve: Rollback to original LoD for now +#include "paddle/fluid/framework/mixed_vector.h" namespace pten { using DDim = paddle::framework::DDim; -using LoD = std::vector>; - +using LoD = std::vector>; /// \brief The meta data of dense tensor. Take the structure type /// and use all default operations. /// @@ -40,18 +41,20 @@ struct DenseTensorMeta { DenseTensorMeta() = default; DenseTensorMeta(DataType dtype, const DDim& dims); - DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout); DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout, - const std::vector>& lod); + size_t offset = 0); + DenseTensorMeta(DataType dtype, + const DDim& dims, + DataLayout layout, + const LoD& lod, + size_t offset = 0); /// \brief Test whether the metadata is valid. Does not throw exceptions. /// \return Whether the metadata is valid. bool valid() const noexcept; - /// During the entire life cycle of a DenseTensor, the following attributes - /// marked with `const` are expected to remain unchanged. bool is_scalar{false}; DDim dims; DataType dtype{DataType::UNDEFINED}; @@ -60,4 +63,10 @@ struct DenseTensorMeta { size_t offset{0}; }; +inline bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) { + return (lhs.is_scalar == rhs.is_scalar) && (lhs.dims == rhs.dims) && + (lhs.dtype == rhs.dtype) && (lhs.layout == rhs.layout) && + (lhs.lod == rhs.lod) && (lhs.offset == rhs.offset); +} + } // namespace pten diff --git a/paddle/pten/core/utils/intrusive_ptr.h b/paddle/pten/core/utils/intrusive_ptr.h index f0e94fadac973..ed9a21e7f3a8a 100644 --- a/paddle/pten/core/utils/intrusive_ptr.h +++ b/paddle/pten/core/utils/intrusive_ptr.h @@ -40,6 +40,11 @@ class intrusive_ptr { rhs.reset(); } + intrusive_ptr& operator=(intrusive_ptr&& rhs) { + swap(rhs); + return *this; + } + void reset() { this_type().swap(*this); } void reset(T* rhs) { this_type(rhs).swap(*this); } diff --git a/paddle/pten/include/infermeta.h b/paddle/pten/include/infermeta.h index 151cb638d85b7..5e356dd37c03e 100644 --- a/paddle/pten/include/infermeta.h +++ b/paddle/pten/include/infermeta.h @@ -16,5 +16,6 @@ limitations under the License. 
*/ // See Note: [ How do we organize the kernel directory ] #include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/infermeta/nary.h" +#include "paddle/pten/infermeta/multiary.h" +#include "paddle/pten/infermeta/nullary.h" #include "paddle/pten/infermeta/unary.h" diff --git a/paddle/pten/include/linalg.h b/paddle/pten/include/linalg.h index 8f627f5fc8b0a..22f287468e673 100644 --- a/paddle/pten/include/linalg.h +++ b/paddle/pten/include/linalg.h @@ -17,8 +17,7 @@ // See Note: [ How do we organize the kernel directory ] #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/cpu/linalg.h" -#include "paddle/pten/kernels/gpu/linalg.h" +#include "paddle/pten/kernels/dot_kernel.h" namespace pten { @@ -31,7 +30,7 @@ DenseTensor Dot(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - Dot(dev_ctx, x, y, &dense_out); + Dot(dev_ctx, x, y, &dense_out); return dense_out; } diff --git a/paddle/pten/include/manipulation.h b/paddle/pten/include/manipulation.h deleted file mode 100644 index e94f2a6180749..0000000000000 --- a/paddle/pten/include/manipulation.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/cpu/manipulation.h" -#include "paddle/pten/kernels/gpu/manipulation.h" -#include "paddle/pten/kernels/xpu/manipulation.h" - -namespace pten { - -template -DenseTensor Flatten(const ContextT& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis) { - auto out_meta = FlattenInferMeta(x.meta(), start_axis, stop_axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Flatten(dev_ctx, x, start_axis, stop_axis, &dense_out); - return dense_out; -} - -template -DenseTensor Cast(const ContextT& dev_ctx, - const DenseTensor& x, - DataType out_dtype, - DataType in_dtype) { - auto out_meta = CastInferMeta(x.meta(), out_dtype); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Cast(dev_ctx, x, out_dtype, in_dtype, &dense_out); - return dense_out; -} - -template -DenseTensor Reshape(const ContextT& dev_ctx, - const DenseTensor& x, - const std::vector& shape) { - auto out_meta = InferMetaFromVecValue(x.meta(), shape); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Reshape(dev_ctx, x, ScalarArray(shape), &dense_out); - return dense_out; -} - -} // namespace pten diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h index 83471692c8746..faa4c8db8dac3 100644 --- a/paddle/pten/include/math.h +++ b/paddle/pten/include/math.h @@ -17,10 +17,7 @@ limitations under the License. 
*/ // See Note: [ How do we organize the kernel directory ] #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/cpu/conj_kernel.h" -#include "paddle/pten/kernels/cpu/math.h" -#include "paddle/pten/kernels/gpu/conj_kernel.h" -#include "paddle/pten/kernels/gpu/math.h" +#include "paddle/pten/kernels/complex_kernel.h" #include "paddle/pten/kernels/scale_kernel.h" namespace pten { @@ -36,41 +33,6 @@ DenseTensor Sign(const ContextT& dev_ctx, const DenseTensor& x) { return dense_out; } -template -DenseTensor Mean(const ContextT& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - bool keep_dim) { - auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - bool reduce_all = false; - Mean(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out); - return dense_out; -} - -template -DenseTensor Sum(const ContextT& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim) { - auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - out_meta); - - // The real value of reduce_all will be get in kernel - // so use default value(false) is OK. - bool reduce_all = false; - - Sum(dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out); - return dense_out; -} - template DenseTensor Scale(const ContextT& dev_ctx, const DenseTensor& x, @@ -86,62 +48,6 @@ DenseTensor Scale(const ContextT& dev_ctx, return dense_out; } -template -DenseTensor Add(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Add(dev_ctx, x, y, axis, &dense_out); - return dense_out; -} - -template -DenseTensor Subtract(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Subtract(dev_ctx, x, y, axis, &dense_out); - return dense_out; -} - -template -DenseTensor Divide(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Divide(dev_ctx, x, y, axis, &dense_out); - return dense_out; -} - -template -DenseTensor Multiply(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Multiply(dev_ctx, x, y, axis, &dense_out); - return dense_out; -} - template DenseTensor Conj(const ContextT& dev_ctx, const DenseTensor& x) { auto out_meta = UnchangedInferMeta(x.meta()); diff --git a/paddle/pten/infermeta/CMakeLists.txt b/paddle/pten/infermeta/CMakeLists.txt index b32ec0a51c736..f92727f33fb05 100644 --- a/paddle/pten/infermeta/CMakeLists.txt +++ b/paddle/pten/infermeta/CMakeLists.txt @@ -1,3 +1 @@ -cc_library(nary SRCS nary.cc DEPS convert_utils) -cc_library(unary SRCS unary.cc DEPS convert_utils) -cc_library(binary SRCS binary.cc DEPS convert_utils) 
+cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils) diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index 5d3844a1dec3d..944c64ecd75e2 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -14,7 +14,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" namespace pten { @@ -162,13 +162,13 @@ DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, std::vector x_dims_array(max_dim); std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); - general::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); return_meta.dims = paddle::framework::make_ddim(out_dims_array); } return_meta.lod = x_meta.lod; diff --git a/paddle/pten/infermeta/multiary.cc b/paddle/pten/infermeta/multiary.cc new file mode 100644 index 0000000000000..5dbf3d58a1952 --- /dev/null +++ b/paddle/pten/infermeta/multiary.cc @@ -0,0 +1,17 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/infermeta/multiary.h" + +namespace pten {} // namespace pten diff --git a/paddle/pten/infermeta/multiary.h b/paddle/pten/infermeta/multiary.h new file mode 100644 index 0000000000000..6aa15159630bc --- /dev/null +++ b/paddle/pten/infermeta/multiary.h @@ -0,0 +1,17 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace pten {} // namespace pten diff --git a/paddle/pten/infermeta/nary.cc b/paddle/pten/infermeta/nullary.cc similarity index 70% rename from paddle/pten/infermeta/nary.cc rename to paddle/pten/infermeta/nullary.cc index 8b12a88f10fc0..731e69e60907b 100644 --- a/paddle/pten/infermeta/nary.cc +++ b/paddle/pten/infermeta/nullary.cc @@ -13,20 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. */ // See Note [ Why still include the fluid headers? 
] -#include "paddle/pten/infermeta/nary.h" +#include "paddle/pten/infermeta/nullary.h" namespace pten { -DenseTensorMeta FullInferMeta(const std::vector& shape, - DataType dtype, - DataLayout layout) { +DenseTensorMeta CreateInferMeta(const std::vector& shape, + DataType dtype, + DataLayout layout) { const auto& out_dims = paddle::framework::make_ddim(shape); return {dtype, out_dims, layout}; } -DenseTensorMeta FullInferMeta(const ScalarArray& shape, - DataType dtype, - DataLayout layout) { +DenseTensorMeta CreateInferMeta(const ScalarArray& shape, + DataType dtype, + DataLayout layout) { const auto& out_dims = paddle::framework::make_ddim(shape.GetData()); return {dtype, out_dims, layout}; } diff --git a/paddle/pten/infermeta/nary.h b/paddle/pten/infermeta/nullary.h similarity index 76% rename from paddle/pten/infermeta/nary.h rename to paddle/pten/infermeta/nullary.h index 010accd2e79e5..721a39bb3ac31 100644 --- a/paddle/pten/infermeta/nary.h +++ b/paddle/pten/infermeta/nullary.h @@ -27,12 +27,12 @@ namespace pten { // Because functions in this file // not only can infer shape, but alse need infer lod or other useful data. -DenseTensorMeta FullInferMeta(const std::vector& shape, - DataType dtype, - DataLayout layout); +DenseTensorMeta CreateInferMeta(const std::vector& shape, + DataType dtype, + DataLayout layout); -DenseTensorMeta FullInferMeta(const ScalarArray& shape, - DataType dtype, - DataLayout layout); +DenseTensorMeta CreateInferMeta(const ScalarArray& shape, + DataType dtype, + DataLayout layout); } // namespace pten diff --git a/paddle/pten/infermeta/unary.cc b/paddle/pten/infermeta/unary.cc index 49d4a24e3a2c4..843a78f3413cf 100644 --- a/paddle/pten/infermeta/unary.cc +++ b/paddle/pten/infermeta/unary.cc @@ -81,9 +81,9 @@ DenseTensorMeta CastInferMeta(const DenseTensorMeta& x_meta, return out_meta; } -DenseTensorMeta FullLikeInferMeta(const DenseTensorMeta& x_meta, - DataType dtype, - DataLayout layout) { +DenseTensorMeta CreateLikeInferMeta(const DenseTensorMeta& x_meta, + DataType dtype, + DataLayout layout) { return {dtype == DataType::UNDEFINED ? x_meta.dtype : dtype, x_meta.dims, layout == DataLayout::UNDEFINED ? x_meta.layout : layout}; diff --git a/paddle/pten/infermeta/unary.h b/paddle/pten/infermeta/unary.h index 3f28b2b48530f..ae42cbd5dd2c6 100644 --- a/paddle/pten/infermeta/unary.h +++ b/paddle/pten/infermeta/unary.h @@ -44,9 +44,9 @@ DenseTensorMeta FlattenInferMeta(const DenseTensorMeta& x_meta, DenseTensorMeta CastInferMeta(const DenseTensorMeta& x_meta, const DataType out_dtype); -DenseTensorMeta FullLikeInferMeta(const DenseTensorMeta& x_meta, - DataType dtype, - DataLayout layout); +DenseTensorMeta CreateLikeInferMeta(const DenseTensorMeta& x_meta, + DataType dtype, + DataLayout layout); DenseTensorMeta InferMetaFromVecValue(const DenseTensorMeta& x_meta, const std::vector& shape); diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index 818ce6cb77ae0..b76d408f89e85 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -1,3 +1,9 @@ +include(pten_kernel) + +set(kernel_declare_file ${PADDLE_BINARY_DIR}/paddle/pten/kernels/declarations.h.tmp CACHE INTERNAL "declarations.h file") +set(kernel_declare_file_final ${PADDLE_BINARY_DIR}/paddle/pten/kernels/declarations.h) +file(WRITE ${kernel_declare_file} "// Generated by the paddle/pten/kernels/CMakeLists.txt. 
DO NOT EDIT!\n\n#pragma once\n\n") + # kernel primitive api add_subdirectory(primitive) # pten hybird functors and functions called by kernels @@ -11,9 +17,26 @@ if(WITH_MKLDNN) # mkldnn will be deprecated and use the new name dnnl add_subdirectory(dnnl) endif() -if(WITH_ASCEND_CL) - add_subdirectory(npu) -endif() if(WITH_XPU) add_subdirectory(xpu) endif() + +# pten depends all pten kernel targets +set_property(GLOBAL PROPERTY PTEN_KERNELS "") + +set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory convert_utils) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas) +# remove this dep after removing fluid deps on tensor creation +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) + +set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel pten_transpose_cpu) +if(WITH_GPU OR WITH_ROCM) + set(MATH_KERNEL_DEPS ${MATH_KERNEL_DEPS} pten_transpose_gpu) +endif() + +# auto build kernel targets by cmake +register_kernels(EXCLUDES math_kernel DEPS ${COMMON_KERNEL_DEPS}) +kernel_library(math_kernel DEPS ${MATH_KERNEL_DEPS}) + +copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/pten/kernels/cast_kernel.h b/paddle/pten/kernels/cast_kernel.h new file mode 100644 index 0000000000000..8fdce9cda6f1d --- /dev/null +++ b/paddle/pten/kernels/cast_kernel.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/empty_kernel.h" + +namespace pten { + +template +void CastKernel(const Context& dev_ctx, + const DenseTensor& x, + DataType out_dtype, + DenseTensor* out); + +template +DenseTensor Cast(const Context& dev_ctx, + const DenseTensor& x, + DataType out_dtype) { + auto out_meta = CastInferMeta(x.meta(), out_dtype); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + CastKernel(dev_ctx, x, out_dtype, &dense_out); + return dense_out; +} + +} // namespace pten diff --git a/paddle/pten/kernels/cpu/conj_kernel.h b/paddle/pten/kernels/complex_kernel.h similarity index 82% rename from paddle/pten/kernels/cpu/conj_kernel.h rename to paddle/pten/kernels/complex_kernel.h index 49dad8f5b2df6..dfe8fff43e6ef 100644 --- a/paddle/pten/kernels/cpu/conj_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -14,12 +14,11 @@ limitations under the License. 
*/ #pragma once -#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/dense_tensor.h" namespace pten { -template -void Conj(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out); +template +void Conj(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); } // namespace pten diff --git a/paddle/pten/kernels/cpu/utils.h b/paddle/pten/kernels/copy_kernel.h similarity index 85% rename from paddle/pten/kernels/cpu/utils.h rename to paddle/pten/kernels/copy_kernel.h index 93730692079e3..a481908892e9b 100644 --- a/paddle/pten/kernels/cpu/utils.h +++ b/paddle/pten/kernels/copy_kernel.h @@ -14,13 +14,12 @@ limitations under the License. */ #pragma once -#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_registry.h" namespace pten { -void Copy(const CPUContext& dev_ctx, +template +void Copy(const Context& dev_ctx, const DenseTensor& src, bool blocking, DenseTensor* dst); diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt index 036ce68ee43c1..e69de29bb2d1d 100644 --- a/paddle/pten/kernels/cpu/CMakeLists.txt +++ b/paddle/pten/kernels/cpu/CMakeLists.txt @@ -1,7 +0,0 @@ -cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu) -cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory) -cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils) -cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_cpu unary) -cc_library(scale_kernel_cpu SRCS scale_kernel.cc DEPS dense_tensor kernel_context kernel_factory eigen_function) -cc_library(full_kernel_cpu SRCS full_kernel.cc DEPS dense_tensor kernel_context kernel_factory eigen_function) -cc_library(conj_kernel_cpu SRCS conj_kernel.cc DEPS dense_tensor kernel_context kernel_factory) diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc new file mode 100644 index 0000000000000..c6736cdd1bcf0 --- /dev/null +++ b/paddle/pten/kernels/cpu/cast_kernel.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/cast_kernel.h" + +#include "paddle/pten/api/ext/dispatch.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/platform/transform.h" + +namespace pten { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +void CastKernelImpl(const CPUContext& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto* in_begin = x.data(); + auto numel = x.numel(); + auto* in_end = in_begin + numel; + + auto* out_begin = out->mutable_data(); + + paddle::platform::Transform trans; + trans(dev_ctx, + in_begin, + in_end, + out_begin, + CastOpTransformFunctor()); +} + +template +void CastKernel(const Context& dev_ctx, + const DenseTensor& x, + DataType out_dtype, + DenseTensor* out) { + PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] { + CastKernelImpl(dev_ctx, x, out); + })); +} + +} // namespace pten + +PT_REGISTER_CTX_KERNEL(cast, + CPU, + ALL_LAYOUT, + pten::CastKernel, + float, + double, + int, + int64_t, + int16_t, + bool, + uint8_t, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} diff --git a/paddle/pten/kernels/cpu/linalg.h b/paddle/pten/kernels/cpu/complex_kernel.cc similarity index 56% rename from paddle/pten/kernels/cpu/linalg.h rename to paddle/pten/kernels/cpu/complex_kernel.cc index 29c6cd16cf81a..9bf27ef22dcd7 100644 --- a/paddle/pten/kernels/cpu/linalg.h +++ b/paddle/pten/kernels/cpu/complex_kernel.cc @@ -12,28 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +#include "paddle/pten/kernels/complex_kernel.h" +#include "paddle/pten/kernels/impl/complex_kernel_impl.h" #include "paddle/pten/backends/cpu/cpu_context.h" -#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/complex.h" -namespace pten { - -template -void Dot(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void Matmul(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - DenseTensor* out); - -} // namespace pten +PT_REGISTER_CTX_KERNEL(conj, + CPU, + ALL_LAYOUT, + pten::Conj, + paddle::platform::complex, + paddle::platform::complex, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/cpu/conj_kernel.cc b/paddle/pten/kernels/cpu/conj_kernel.cc deleted file mode 100644 index f10d9f761eaed..0000000000000 --- a/paddle/pten/kernels/cpu/conj_kernel.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/pten/kernels/cpu/conj_kernel.h" - -#include "paddle/pten/backends/cpu/cpu_context.h" -#include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/hybird/math/conj_impl.h" - -namespace pten { - -template -void Conj(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) { - ConjImpl(dev_ctx, x, out); -} - -} // namespace pten - -PT_REGISTER_KERNEL(conj, - CPU, - ALL_LAYOUT, - pten::Conj, - paddle::platform::complex, - paddle::platform::complex, - float, - double, - int, - int64_t) {} diff --git a/paddle/pten/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/copy_kernel.cc similarity index 84% rename from paddle/pten/kernels/cpu/utils.cc rename to paddle/pten/kernels/cpu/copy_kernel.cc index 1ca20df4d92dc..f3c4156fcddf0 100644 --- a/paddle/pten/kernels/cpu/utils.cc +++ b/paddle/pten/kernels/cpu/copy_kernel.cc @@ -12,15 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/pten/kernels/cpu/utils.h" -#include "paddle/fluid/memory/memcpy.h" +#include "paddle/pten/kernels/copy_kernel.h" + +#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/kernel_registry.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/memcpy.h" namespace pten { // NOTE(chenweihang): blocking is useless in cpu kernel -void Copy(const CPUContext& dev_ctx, +template +void Copy(const Context& dev_ctx, const DenseTensor& src, bool blocking, DenseTensor* dst) { @@ -57,4 +63,5 @@ void Copy(const CPUContext& dev_ctx, } // namespace pten -PT_REGISTER_NO_TEMPLATE_KERNEL(copy, CPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL( + copy, CPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {} diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc new file mode 100644 index 0000000000000..247ad1216a266 --- /dev/null +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/dot_kernel.h" + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/platform/complex.h" + +namespace pten { + +template +void Dot(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; + auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; + auto* z = out->mutable_data(); + + // Loop over the total N elements of both operands while sum-reducing every + // B pairs along the way where B is the dimension of the least ordered axis + auto&& d = x.dims(); + auto const N = x.numel(); + auto const B = d[d.size() - 1]; + + for (int j = 0; j < N / B; j++) { + T ss = 0; + for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++); + z[j] = ss; + } +} + +} // namespace pten + +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + +PT_REGISTER_CTX_KERNEL(dot, + CPU, + ALL_LAYOUT, + pten::Dot, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h new file mode 100644 index 0000000000000..d3687b22fb392 --- /dev/null +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -0,0 +1,392 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" + +namespace pten { + +// Add +template +struct SameDimsAddFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VADD(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + z->mutable_data(); + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x + eigen_y; + } +}; + +// Subtract +template +struct SameDimsSubtractFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsSubtractFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VSUB(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsSubtractFunctor< + DevCtx, + T, + 
typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x - eigen_y; + } +}; + +// Divide +template +struct SameDimsDivideFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsDivideFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + paddle::platform::errors::InvalidArgument( + "If use SameDimsDivideFunctor, template args(T) must be floating " + "point. "); + } +}; + +template +struct SameDimsDivideFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VDIV(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +// Multiply +template +struct SameDimsMultiplyFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsMultiplyFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VMUL(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsMultiplyFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x * eigen_y; + } +}; + +inline void UpdateElementwiseIndexArray(const int* out_dims_array, + const int max_dim, + int* index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +inline int GetElementwiseIndex(const int* x_dims_array, + const int max_dim, + const int* index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +template +void CommonForwardBroadcastCPU(const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z, + int* x_dims_array, + int* y_dims_array, + int* out_dims_array, + int max_dim, + const paddle::platform::CPUDeviceContext& ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T* x_data = x.data(); + const T* y_data = y.data(); + PADDLE_ENFORCE_NOT_NULL(x_data, + paddle::platform::errors::InvalidArgument( + "The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL(y_data, + paddle::platform::errors::InvalidArgument( + "The input Y should not be empty.")); + OutType* out_data = z->mutable_data(); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int 
x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +void CommonElementwiseBroadcastForward( + const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z, + const DDim& x_dims, + const DDim& y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = (std::max)(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dev_ctx, + func, + is_xsize_larger); +} + +// It is a common CPU implementation to compute binary calculation with the +// support of broadcast. Note: +// 1. CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like AddFunctor and InverseAddFunctor. +// 2. The corresponding GPU implementation supports all the broadcast cases, +// thus there is no need to define and call with XxxInverseFunctor. +// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. +template +void ElementwiseCompute(const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + Functor func, + DenseTensor* z) { + z->mutable_data(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + funcs:: + TransformFunctor + functor(x, y, z, dev_ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? 
x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. + // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } +} + +template +struct SameDimsElementwiseCompute { + void operator()(const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + Functor()(dev_ctx, x, y, z); + } +}; + +} // namespace pten diff --git a/paddle/pten/kernels/cpu/full_kernel.cc b/paddle/pten/kernels/cpu/full_kernel.cc index 4912656bb2aef..1ae8001d79dc7 100644 --- a/paddle/pten/kernels/cpu/full_kernel.cc +++ b/paddle/pten/kernels/cpu/full_kernel.cc @@ -21,7 +21,7 @@ limitations under the License. */ PT_REGISTER_CTX_KERNEL(full, CPU, ALL_LAYOUT, - pten::Full, + pten::FullKernel, float, double, uint8_t, @@ -37,7 +37,7 @@ PT_REGISTER_CTX_KERNEL(full, PT_REGISTER_CTX_KERNEL(full_like, CPU, ALL_LAYOUT, - pten::FullLike, + pten::FullLikeKernel, float, double, int, diff --git a/paddle/pten/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc deleted file mode 100644 index 87c4078896a18..0000000000000 --- a/paddle/pten/kernels/cpu/linalg.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pten/kernels/cpu/linalg.h" - -#include "paddle/pten/core/kernel_registry.h" - -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/complex.h" - -#include "paddle/pten/kernels/hybird/math/matmul_func.h" - -namespace pten { - -template -void Dot(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; - auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; - auto* z = out->mutable_data(); - - // Loop over the total N elements of both operands while sum-reducing every - // B pairs along the way where B is the dimension of the least ordered axis - auto&& d = x.dims(); - auto const N = x.numel(); - auto const B = d[d.size() - 1]; - - for (int j = 0; j < N / B; j++) { - T ss = 0; - for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++); - z[j] = ss; - } -} - -template -void Matmul(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - DenseTensor* out) { - PADDLE_ENFORCE_NE(paddle::framework::product(x.dims()), - 0, - paddle::platform::errors::InvalidArgument( - "The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE(paddle::framework::product(y.dims()), - 0, - paddle::platform::errors::InvalidArgument( - "The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. ")); - math::MatMulFunction( - dev_ctx, x, y, out, transpose_x, transpose_y); -} - -} // namespace pten - -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; - -PT_REGISTER_KERNEL(dot, - CPU, - ALL_LAYOUT, - pten::Dot, - float, - double, - int, - int64_t, - complex64, - complex128) {} - -PT_REGISTER_KERNEL(matmul, - CPU, - ALL_LAYOUT, - pten::Matmul, - float, - double, - complex64, - complex128) {} diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc deleted file mode 100644 index 32bc8e4e35d7b..0000000000000 --- a/paddle/pten/kernels/cpu/manipulation.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pten/kernels/cpu/manipulation.h" -#include "paddle/pten/api/ext/dispatch.h" -#include "paddle/pten/infermeta/unary.h" -#include "paddle/pten/kernels/cpu/utils.h" -#include "paddle/pten/kernels/hybird/general/manipulation.h" -#include "paddle/pten/kernels/hybird/math/cast_func.h" - -namespace pten { - -template -void Flatten(const CPUContext& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out) { - auto out_dims = out->dims(); - pten::Copy(dev_ctx, x, false, out); - out->Resize(out_dims); -} - -// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate -// Output Tensor, -// is there a more flexible way to deal with this case? 
-template -void FlattenWithXShape(const CPUContext& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out, - DenseTensor* xshape) { - Flatten(dev_ctx, x, start_axis, stop_axis, out); - general::SetXShape(x, xshape); -} - -void Reshape(const CPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* out) { - auto out_meta = InferMetaFromVecValue(x.meta(), shape.GetData()); - if (x.data() == out->data() && x.numel() == out->numel()) { - out->Resize(out_meta.dims); - return; - } - pten::Copy(dev_ctx, x, false, out); - out->Resize(out_meta.dims); - out->ResetLoD(x.lod()); -} - -void ReshapeWithXShape(const CPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* xshape, - DenseTensor* out) { - general::SetXShape(x, xshape); - Reshape(dev_ctx, x, shape, out); -} - -template -void Cast(const CPUContext& dev_ctx, - const DenseTensor& x, - DataType out_dtype, - DataType in_dtype, - DenseTensor* out) { - PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] { - math::CastKernelImpl( - dev_ctx, x, out); - })); -} - -} // namespace pten - -PT_REGISTER_KERNEL(flatten, - CPU, - ALL_LAYOUT, - pten::Flatten, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, - CPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} - -PT_REGISTER_KERNEL(cast, - CPU, - ALL_LAYOUT, - pten::Cast, - float, - double, - int, - int64_t, - int16_t, - bool, - uint8_t, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - -PT_REGISTER_NO_TEMPLATE_KERNEL( - reshape, CPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {} -PT_REGISTER_NO_TEMPLATE_KERNEL( - reshape_with_xshape, CPU, ALL_LAYOUT, pten::ReshapeWithXShape, ALL_DTYPE) {} diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc deleted file mode 100644 index 861ecf2829feb..0000000000000 --- a/paddle/pten/kernels/cpu/math.cc +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pten/kernels/cpu/math.h" - -#include "paddle/pten/api/ext/dispatch.h" -#include "paddle/pten/kernels/hybird/cpu/elementwise.h" -#include "paddle/pten/kernels/hybird/eigen/reduce.h" -#include "paddle/pten/kernels/hybird/eigen/sign.h" -#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" -#include "paddle/pten/kernels/hybird/general/reduce_impl.h" - -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" - -namespace pten { - -template -void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) { - eigen::Sign(dev_ctx, x, out); -} - -template -void Mean(const CPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - pten::general::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void Divide(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - // allocate memory for out - out->mutable_data(); - if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( - dev_ctx, x, y, out); - } else { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, general::DivideFunctor(), out); - } else { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, general::InverseDivideFunctor(), out); - } - } -} - -template -void Sum(const CPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - pten::general::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -// Create the definition of Add -DEFINE_CPU_ELEMENTWISE_OP(Add) - -// Create the definition of Subtract -DEFINE_CPU_ELEMENTWISE_OP(Subtract) - -// Create the definition of Multiply -DEFINE_CPU_ELEMENTWISE_OP(Multiply) - -} // namespace pten - -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; - -// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 -// using bfloat16 = ::paddle::platform::bfloat16; -PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, pten::Sign, float, double) {} -PT_REGISTER_KERNEL(mean, CPU, ALL_LAYOUT, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL(add, - CPU, - ALL_LAYOUT, - pten::Add, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_KERNEL(subtract, - CPU, - ALL_LAYOUT, - pten::Subtract, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_KERNEL(divide, - CPU, - ALL_LAYOUT, - pten::Divide, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_KERNEL(multiply, - CPU, - ALL_LAYOUT, - pten::Multiply, - float, - double, - int, - int64_t, - bool, - complex64, - complex128) {} -PT_REGISTER_KERNEL(sum, - CPU, - ALL_LAYOUT, - pten::Sum, - bool, - float, - double, - paddle::platform::float16, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h deleted file mode 100644 index 61e361d37ab3d..0000000000000 --- a/paddle/pten/kernels/cpu/math.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/backends/cpu/cpu_context.h" -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_registry.h" - -namespace pten { - -template -void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out); - -template -void Mean(const CPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void Add(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Subtract(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Divide(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Multiply(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); -template -void Sum(const CPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - -} // namespace pten - -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name(const CPUContext& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - out->mutable_data(); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute< \ - general::SameDims##name##Functor>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::name##Functor(), out); \ - } else { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::Inverse##name##Functor(), out); \ - } \ - } \ - } diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc new file mode 100644 index 0000000000000..be0d52355bce6 --- /dev/null +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/math_kernel.h" + +#include "paddle/pten/api/ext/dispatch.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/cpu/elementwise.h" +#include "paddle/pten/kernels/cpu/reduce.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/funcs/reduce_functor.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" + +namespace pten { + +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + out->mutable_data(); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::name##Functor(), out); \ + } else { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ + } \ + } \ + } + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + pten::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + out->mutable_data(); + if (x.dims() == y.dims() && std::is_floating_point::value) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); + } else { + ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); + } + } +} + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + pten::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +// Create the definition of Add +DEFINE_CPU_ELEMENTWISE_OP(Add) + +// Create the definition of Subtract +DEFINE_CPU_ELEMENTWISE_OP(Subtract) + +// Create the definition of Multiply +DEFINE_CPU_ELEMENTWISE_OP(Multiply) + +} // namespace pten + +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::paddle::platform::bfloat16; +PT_REGISTER_CTX_KERNEL( + mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {} +PT_REGISTER_CTX_KERNEL(add, + CPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(subtract, + CPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(divide, + CPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(multiply, + CPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(sum, + CPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + paddle::platform::float16, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} diff --git a/paddle/pten/kernels/cpu/matmul_kernel.cc b/paddle/pten/kernels/cpu/matmul_kernel.cc new file mode 100644 index 0000000000000..edba402ec1d84 --- /dev/null +++ b/paddle/pten/kernels/cpu/matmul_kernel.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 
2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/kernels/matmul_kernel.h" + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/kernels/impl/matmul_kernel_impl.h" + +PT_REGISTER_CTX_KERNEL(matmul, + CPU, + ALL_LAYOUT, + pten::MatmulKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/hybird/eigen/reduce.h b/paddle/pten/kernels/cpu/reduce.h similarity index 78% rename from paddle/pten/kernels/hybird/eigen/reduce.h rename to paddle/pten/kernels/cpu/reduce.h index d60a416dfdb37..fa603b2163055 100644 --- a/paddle/pten/kernels/hybird/eigen/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -14,16 +14,19 @@ #pragma once +#include + +#include "paddle/pten/api/ext/dispatch.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/cast_kernel.h" + #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/hybird/eigen/common.h" #include "paddle/pten/kernels/hybird/transpose.h" - // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" - namespace pten { -namespace eigen { template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->sum(dim); +template +void Reduce(const DeviceContext& dev_ctx, + const DenseTensor& x, + bool reduce_all, + const std::vector& dims, + bool keep_dim, + DataType out_dtype, + DenseTensor* out) { + // If the dims has full dim, set the reduce_all is True + const int& input_dim_size = x.dims().size(); + std::set dims_set(dims.begin(), dims.end()); + bool full_dim = true; + for (int i = 0; i < input_dim_size; ++i) { + if (dims_set.find(i) == dims_set.end() && + dims_set.find(i - input_dim_size) == dims_set.end()) { + full_dim = false; + break; + } } -}; + reduce_all = (reduce_all || full_dim); -//////// Mean Functor /////// -struct MeanFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->mean(dim); + // no need to cast dtype + if (out_dtype == pten::DataType::UNDEFINED || out_dtype == x.dtype()) { + if (out_dtype == pten::DataType::UNDEFINED) { + out_dtype = x.dtype(); + } + // do reduce sum + PD_VISIT_ALL_TYPES( + out_dtype, "ReduceKernelImpl", ([&] { + pten::ReduceKernelImpl( + dev_ctx, x, out, dims, keep_dim, reduce_all); + })); + } else { + pten::DenseTensor tmp_tensor = pten::DenseTensor( + pten::make_intrusive(x.place()), + pten::DenseTensorMeta(out_dtype, x.dims(), x.layout())); + + // cast x tensor to out_dtype + pten::CastKernel(dev_ctx, x, out_dtype, &tmp_tensor); + + // do reduce sum + PD_VISIT_ALL_TYPES( + out_dtype, "ReduceKernelImpl", ([&] { + pten::ReduceKernelImpl( + dev_ctx, tmp_tensor, out, dims, keep_dim, reduce_all); + })); } -}; +} -} // namespace eigen } // namespace pten 
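Note on the new cpu/reduce.h helper shown above: pten::Reduce first checks whether out_dtype requires a conversion, casts the input tensor with pten::CastKernel when it does, and only then dispatches ReduceKernelImpl through PD_VISIT_ALL_TYPES, which instantiates the kernel body once per runtime dtype. The sketch below is a minimal, self-contained illustration of that cast-then-visit-by-dtype pattern only; DataType here is a reduced stand-in enum and CastThenSum/RunTypedSum are hypothetical names, not part of the pten API or the real PD_VISIT_ALL_TYPES macro.

#include <cstdint>
#include <stdexcept>
#include <vector>

// Hypothetical, reduced stand-in for pten's DataType enum.
enum class DataType { FLOAT32, FLOAT64, INT64 };

// Mirror the structure of pten::Reduce: cast the input to the requested
// dtype first, then run the reduction in that type.
template <typename T>
double CastThenSum(const std::vector<double>& data) {
  std::vector<T> casted(data.begin(), data.end());  // the "CastKernel" step
  T acc = T(0);
  for (const T& v : casted) acc += v;               // the "ReduceKernelImpl" step
  return static_cast<double>(acc);
}

// Runtime dtype -> template instantiation: the role PD_VISIT_ALL_TYPES plays.
inline double RunTypedSum(DataType out_dtype, const std::vector<double>& data) {
  switch (out_dtype) {
    case DataType::FLOAT32: return CastThenSum<float>(data);
    case DataType::FLOAT64: return CastThenSum<double>(data);
    case DataType::INT64:   return CastThenSum<int64_t>(data);
  }
  throw std::runtime_error("unsupported dtype");
}

int main() {
  const std::vector<double> x = {1.25, 2.25, 3.5};
  // Summing in INT64 truncates each element first: 1 + 2 + 3 = 6.
  return RunTypedSum(DataType::INT64, x) == 6.0 ? 0 : 1;
}

As in the real helper, the cast happens once over the whole input rather than per element inside the reduction loop, so the numeric behavior (for example, truncation when reducing in an integer dtype) is fixed before the reduce step runs.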
diff --git a/paddle/pten/kernels/xpu/utils.h b/paddle/pten/kernels/cpu/sign_kernel.cc similarity index 67% rename from paddle/pten/kernels/xpu/utils.h rename to paddle/pten/kernels/cpu/sign_kernel.cc index 6e34502eb23a5..c6e352f7da44a 100644 --- a/paddle/pten/kernels/xpu/utils.h +++ b/paddle/pten/kernels/cpu/sign_kernel.cc @@ -12,24 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#include "paddle/pten/kernels/sign_kernel.h" +#include "paddle/pten/kernels/impl/sign_kernel_impl.h" -#ifdef PADDLE_WITH_XPU - -#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/device_context.h" -namespace pten { - -using XPUDeviceContext = paddle::platform::XPUDeviceContext; - -void Copy(const XPUDeviceContext& dev_ctx, - const DenseTensor& src, - bool blocking, - DenseTensor* dst); - -} // namespace pten +#include "paddle/fluid/platform/bfloat16.h" -#endif +PT_REGISTER_CTX_KERNEL(sign, CPU, ALL_LAYOUT, pten::Sign, float, double) {} diff --git a/paddle/pten/kernels/gpu/linalg.h b/paddle/pten/kernels/dot_kernel.h similarity index 65% rename from paddle/pten/kernels/gpu/linalg.h rename to paddle/pten/kernels/dot_kernel.h index a848f55c7b9f0..9924749cd2141 100644 --- a/paddle/pten/kernels/gpu/linalg.h +++ b/paddle/pten/kernels/dot_kernel.h @@ -14,28 +14,14 @@ #pragma once -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" namespace pten { -template -void Dot(const GPUContext& dev_ctx, +template +void Dot(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out); -template -void Matmul(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - DenseTensor* out); - } // namespace pten - -#endif diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc new file mode 100644 index 0000000000000..94886806bccf3 --- /dev/null +++ b/paddle/pten/kernels/empty_kernel.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/kernels/empty_kernel.h" + +#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace pten { + +template +void EmptyKernel(const ContextT& dev_ctx, + const ScalarArray& shape, + DenseTensor* out) { + out->Resize(paddle::framework::make_ddim(shape.GetData())); +} + +template +void EmptyLikeKernel(const ContextT& dev_ctx, DenseTensor* out) { + out->mutable_data(); +} + +} // namespace pten + +PT_REGISTER_CTX_KERNEL(empty, + CPU, + ALL_LAYOUT, + pten::EmptyKernel, + bool, + int, + int64_t, + float, + double, + paddle::platform::float16) {} + +PT_REGISTER_CTX_KERNEL(empty_like, + CPU, + ALL_LAYOUT, + pten::EmptyLikeKernel, + bool, + int, + int64_t, + float, + double, + paddle::platform::float16) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_REGISTER_CTX_KERNEL(empty, + GPU, + ALL_LAYOUT, + pten::EmptyKernel, + bool, + int, + int64_t, + float, + double, + paddle::platform::float16) {} + +PT_REGISTER_CTX_KERNEL(empty_like, + GPU, + ALL_LAYOUT, + pten::EmptyLikeKernel, + bool, + int, + int64_t, + float, + double, + paddle::platform::float16) {} +#endif diff --git a/paddle/pten/kernels/empty_kernel.h b/paddle/pten/kernels/empty_kernel.h new file mode 100644 index 0000000000000..d71ee0b1266f2 --- /dev/null +++ b/paddle/pten/kernels/empty_kernel.h @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/common/scalar_array.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/infermeta/nullary.h" +#include "paddle/pten/infermeta/unary.h" + +namespace pten { + +template +void EmptyKernel(const Context& dev_ctx, + const ScalarArray& shape, + DenseTensor* out); + +template +void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out); + +// TODO(chenweihang): the tensor creation method need to be replaced later, +// all kernel api call Empty here instead of making tensor self +template +DenseTensor Empty(const Context& dev_ctx, DenseTensorMeta&& meta) { + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(meta)); + return dense_out; +} + +template +DenseTensor Empty(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype = DataType::FLOAT32, + Backend backend = Backend::CPU, // Is backend needed here? + DataLayout layout = DataLayout::NCHW) { + auto out_meta = CreateInferMeta(shape, dtype, layout); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + EmptyKernel(dev_ctx, shape, &dense_out); + return dense_out; +} + +template +DenseTensor EmptyLike( + const Context& dev_ctx, + const DenseTensor& x, + DataType dtype = DataType::UNDEFINED, + Backend backend = Backend::UNDEFINED, // Is backend needed here? 
+ DataLayout layout = DataLayout::UNDEFINED) { + auto out_meta = CreateLikeInferMeta(x.meta(), dtype, layout); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + EmptyLikeKernel(dev_ctx, &dense_out); + return dense_out; +} + +} // namespace pten diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc new file mode 100644 index 0000000000000..d6aea31748d6c --- /dev/null +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/flatten_grad_kernel.h" +#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/copy_kernel.h" + +namespace pten { + +template +void FlattenGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& xshape, + DenseTensor* x_grad) { + auto xshape_dims = xshape.dims(); + auto x_dims = + paddle::framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + pten::Copy(dev_ctx, out_grad, false, x_grad); + x_grad->Resize(x_dims); +} + +} // namespace pten + +PT_REGISTER_CTX_KERNEL(flatten_grad, + CPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_REGISTER_CTX_KERNEL(flatten_grad, + GPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} + +#endif + +#ifdef PADDLE_WITH_XPU +PT_REGISTER_CTX_KERNEL(flatten_grad, + XPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} + +#endif diff --git a/paddle/pten/kernels/gpu/conj_kernel.h b/paddle/pten/kernels/flatten_grad_kernel.h similarity index 66% rename from paddle/pten/kernels/gpu/conj_kernel.h rename to paddle/pten/kernels/flatten_grad_kernel.h index 7541f9290d246..91d9aa7c30609 100644 --- a/paddle/pten/kernels/gpu/conj_kernel.h +++ b/paddle/pten/kernels/flatten_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,17 +14,14 @@ limitations under the License. 
*/ #pragma once -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" namespace pten { -template -void Conj(const GPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out); +template +void FlattenGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const DenseTensor& xshape, + DenseTensor* x_grad); } // namespace pten - -#endif diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc new file mode 100644 index 0000000000000..b284d3690830f --- /dev/null +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/flatten_kernel.h" +#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/infermeta/unary.h" +#include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/funcs/common_shape.h" + +namespace pten { + +template +void FlattenKernel(const Context& dev_ctx, + const DenseTensor& x, + int start_axis, + int stop_axis, + DenseTensor* out) { + auto out_dims = out->dims(); + pten::Copy(dev_ctx, x, false, out); + out->Resize(out_dims); +} + +// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate +// Output Tensor, +// is there a more flexible way to deal with this case? 
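// For reference: xshape carries no real data. funcs::SetXShape (kept in the
// renamed funcs/common_shape.h later in this diff) records x's dims with a
// leading 0 prepended, plus x's LoD, so for an x of dims [6, 4] the xshape
// dims are [0, 6, 4]. FlattenGradKernel above undoes this by slicing off the
// leading entry:
//
//   auto x_dims = paddle::framework::slice_ddim(xshape_dims, 1,
//                                               xshape_dims.size());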
+template +void FlattenWithXShape(const Context& dev_ctx, + const DenseTensor& x, + int start_axis, + int stop_axis, + DenseTensor* out, + DenseTensor* xshape) { + FlattenKernel(dev_ctx, x, start_axis, stop_axis, out); + funcs::SetXShape(x, xshape); +} + +} // namespace pten + +PT_REGISTER_CTX_KERNEL(flatten, + CPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} + +PT_REGISTER_CTX_KERNEL(flatten_with_xshape, + CPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_REGISTER_CTX_KERNEL(flatten, + GPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} + +PT_REGISTER_CTX_KERNEL(flatten_with_xshape, + GPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} +#endif + +#ifdef PADDLE_WITH_XPU +PT_REGISTER_CTX_KERNEL(flatten, + XPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} + +PT_REGISTER_CTX_KERNEL(flatten_with_xshape, + XPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} +#endif diff --git a/paddle/pten/kernels/flatten_kernel.h b/paddle/pten/kernels/flatten_kernel.h new file mode 100644 index 0000000000000..a67e66fac4130 --- /dev/null +++ b/paddle/pten/kernels/flatten_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/empty_kernel.h" + +namespace pten { + +template +void FlattenKernel(const Context& dev_ctx, + const DenseTensor& x, + int start_axis, + int stop_axis, + DenseTensor* out); + +template +void FlattenWithXShape(const Context& dev_ctx, + const DenseTensor& x, + int start_axis, + int stop_axis, + DenseTensor* out, + DenseTensor* xshape); + +template +DenseTensor Flatten(const Context& dev_ctx, + const DenseTensor& x, + int start_axis, + int stop_axis) { + auto out_meta = FlattenInferMeta(x.meta(), start_axis, stop_axis); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + FlattenKernel(dev_ctx, x, start_axis, stop_axis, &dense_out); + return dense_out; +} + +} // namespace pten diff --git a/paddle/pten/kernels/full_kernel.h b/paddle/pten/kernels/full_kernel.h index f8abb9436679b..bc484fb4edffa 100644 --- a/paddle/pten/kernels/full_kernel.h +++ b/paddle/pten/kernels/full_kernel.h @@ -14,20 +14,51 @@ #pragma once -#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/infermeta/nullary.h" +#include "paddle/pten/kernels/empty_kernel.h" + namespace pten { -template -void Full(const ContextT& dev_ctx, - const ScalarArray& shape, - const Scalar& val, - DenseTensor* out); +template +void FullKernel(const Context& dev_ctx, + const ScalarArray& shape, + const Scalar& val, + DenseTensor* out); + +template +void FullLikeKernel(const Context& dev_ctx, + const Scalar& val, + DenseTensor* out); + +template +DenseTensor Full(const Context& dev_ctx, + const ScalarArray& shape, + const Scalar& val, + DataType dtype = DataType::FLOAT32, + Backend backend = Backend::CPU, // Is backend needed here? + DataLayout layout = DataLayout::NCHW) { + auto out_meta = CreateInferMeta(shape, dtype, layout); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + FullKernel(dev_ctx, shape, val, &dense_out); + return dense_out; +} -template -void FullLike(const ContextT& dev_ctx, const Scalar& val, DenseTensor* out); +template +DenseTensor FullLike( + const Context& dev_ctx, + const DenseTensor& x, + const Scalar& val, + DataType dtype = DataType::UNDEFINED, + Backend backend = Backend::UNDEFINED, // Is backend needed here? + DataLayout layout = DataLayout::UNDEFINED) { + auto out_meta = CreateLikeInferMeta(x.meta(), dtype, layout); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + FullLikeKernel(dev_ctx, val, &dense_out); + return dense_out; +} } // namespace pten diff --git a/paddle/pten/kernels/hybird/general/manipulation.h b/paddle/pten/kernels/funcs/common_shape.h similarity index 96% rename from paddle/pten/kernels/hybird/general/manipulation.h rename to paddle/pten/kernels/funcs/common_shape.h index 85f6b613ac609..f0678e4706e04 100644 --- a/paddle/pten/kernels/hybird/general/manipulation.h +++ b/paddle/pten/kernels/funcs/common_shape.h @@ -17,7 +17,7 @@ limitations under the License. 
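// A minimal sketch of how the header-level creation helpers above compose
// (Context stands for whichever device context the caller holds; the shapes
// and values are illustrative assumptions):
//
//   // x has dims [2, 3, 4]
//   auto y = pten::Flatten<float, Context>(dev_ctx, x,
//                                          /*start_axis=*/1,
//                                          /*stop_axis=*/2);   // dims [2, 12]
//   auto ones = pten::Full<float, Context>(
//       dev_ctx, pten::ScalarArray(std::vector<int64_t>{2, 12}),
//       pten::Scalar(1.0f));                 // FLOAT32/NCHW defaults
//   auto zeros = pten::FullLike<float, Context>(dev_ctx, y, pten::Scalar(0.0f));
//
// All three go through Empty() first and then run the corresponding kernel on
// the freshly allocated DenseTensor.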
*/ #include "paddle/pten/core/dense_tensor.h" namespace pten { -namespace general { +namespace funcs { inline void SetXShape(const DenseTensor& x, DenseTensor* xshape) { const auto& in_dims = x.meta().dims; @@ -30,5 +30,5 @@ inline void SetXShape(const DenseTensor& x, DenseTensor* xshape) { xshape->ResetLoD(x.meta().lod); } -} // namespace general +} // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/funcs/cuda_kernel_config.h b/paddle/pten/kernels/funcs/cuda_kernel_config.h new file mode 100644 index 0000000000000..27fbc1de55a35 --- /dev/null +++ b/paddle/pten/kernels/funcs/cuda_kernel_config.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +#ifdef __HIPCC__ +#define ELEMENTWISE_BLOCK_SIZE 256 +#else +#define ELEMENTWISE_BLOCK_SIZE 512 +#endif + +namespace pten { +namespace funcs { +/* +* According to NVIDIA, if number of threads per block is 64/128/256/512, +* cuda performs better. And number of blocks should be greater (at least +* 2x~4x) than number of SMs. Hence, SM count is took into account within +* this function to determine the right number of threads per block. +*/ +inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx, + int64_t numel, + int vec_size) { + int threads = ELEMENTWISE_BLOCK_SIZE; + int sm_count = ctx.GetSMCount(); + int active_threads_num = numel / vec_size; + if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about twice of SM, to acquire better performance. + threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / + (sm_count << 1)); + } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) { + // Round up threads number into an exponential multiple of 2, while number + // of acitve blocks is about 4 times of SM, to acquire better performance. + threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / + (sm_count << 2)); + } + // Number of threads per block shall be larger than 64. + return std::max(64, threads); +} + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/hybird/general/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h similarity index 99% rename from paddle/pten/kernels/hybird/general/elementwise_base.h rename to paddle/pten/kernels/funcs/elementwise_base.h index 20154a8744f3d..a0c6d5ba57011 100644 --- a/paddle/pten/kernels/hybird/general/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -19,7 +19,7 @@ limitations under the License. 
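// Worked example of the GetThreadsConfig heuristic above (the numbers are
// illustrative): with numel = 65536, vec_size = 4 and sm_count = 80,
// active_threads_num = 16384 and 16384 / (80 << 1) = 102, which is below the
// non-HIP ELEMENTWISE_BLOCK_SIZE of 512; the thread count is therefore
// rounded up to the next power of two, RoundToPowerOfTwo(102) = 128, and the
// launch uses std::max(64, 128) = 128 threads per block, keeping roughly two
// active blocks per SM instead of under-filling 512-thread blocks.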
*/ #include "paddle/pten/core/dense_tensor.h" namespace pten { -namespace general { +namespace funcs { using DDim = paddle::framework::DDim; @@ -378,6 +378,5 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, } } } - -} // namespace general +} // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h new file mode 100644 index 0000000000000..9b2519b0fd6b1 --- /dev/null +++ b/paddle/pten/kernels/funcs/elementwise_functor.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace pten { +namespace funcs { + +// Define the binary functors used in elementwise ops. + +// Add +template +struct AddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } +}; +template +struct InverseAddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } +}; + +// Subtract +template +struct SubtractFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } +}; +template +struct InverseSubtractFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } +}; + +// Multiply +template +struct MultiplyFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } +}; +template +struct InverseMultiplyFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } +}; + +// Divide +#define DIV_ERROR_INFO \ + "InvalidArgumentError: Integer division by zero encountered in " \ + "(floor) divide. Please check the input value." + +template +struct DivideFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } +}; + +template +struct DivideFunctor< + T, + typename std::enable_if::value>::type> { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + // For int32/int64, need to check whether the divison is zero. + PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); + return a / b; + } +}; + +template +struct InverseDivideFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } +}; + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/funcs/reduce_functor.h b/paddle/pten/kernels/funcs/reduce_functor.h new file mode 100644 index 0000000000000..64ada0231892e --- /dev/null +++ b/paddle/pten/kernels/funcs/reduce_functor.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
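// A small illustration of the functors above (the element types are picked
// for the example): the std::enable_if specialization routes every integral T
// through the divide-by-zero check, while floating-point types keep the
// unchecked fast path.
//
//   funcs::AddFunctor<float>()(1.5f, 2.5f);    // 4.0f
//   funcs::DivideFunctor<float>()(1.0f, 0.0f); // no check, IEEE infinity
//   funcs::DivideFunctor<int>()(6, 0);         // PADDLE_ENFORCE fails with
//                                              // DIV_ERROR_INFO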
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace pten { +namespace funcs { + +//////// Sum Functor /////// +struct SumFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->sum(dim); + } +}; + +//////// Mean Functor /////// +struct MeanFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->mean(dim); + } +}; + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/gpu/CMakeLists.txt b/paddle/pten/kernels/gpu/CMakeLists.txt index 11ff1608b814c..e69de29bb2d1d 100644 --- a/paddle/pten/kernels/gpu/CMakeLists.txt +++ b/paddle/pten/kernels/gpu/CMakeLists.txt @@ -1,17 +0,0 @@ -if(WITH_GPU) - nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu) - nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory) - nv_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils) - nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary) - nv_library(scale_kernel_gpu SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function) - nv_library(full_kernel_gpu SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function) - nv_library(conj_kernel_gpu SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory) -elseif(WITH_ROCM) - hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu) - hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory) - hip_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils) - hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary) - hip_library(scale_kernel_gpu SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function) - hip_library(full_kernel_gpu SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function) - hip_library(conj_kernel_gpu SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory) -endif() diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu new file mode 100644 index 0000000000000..9f65400f93b9f --- /dev/null +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -0,0 +1,87 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
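// The reduce functors above are one-liners on purpose: a generic reduce
// driver hands them Eigen expressions and they only pick the reduction op.
// A rough usage sketch (the EigenVector/EigenScalar helpers are the ones used
// elsewhere in this patch; the reduced dimension is an assumption made for
// the example):
//
//   auto x_e = pten::EigenVector<T>::Flatten(x);
//   auto out_e = pten::EigenScalar<T>::From(*out);
//   funcs::SumFunctor()(*dev_ctx.eigen_device(), &x_e, &out_e,
//                       Eigen::DSizes<int, 1>(0));   // out = x.sum()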
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/cast_kernel.h" + +#include "paddle/pten/api/ext/dispatch.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/float16.h" + +namespace pten { + +template +struct CastFuctor { + __device__ __forceinline__ OutT operator()(const InT& x) const { + return static_cast(x); + } +}; + +template +void CastCUDAKernelImpl(const GPUContext& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + std::vector inputs; + std::vector outputs; + inputs.emplace_back(&x); + outputs.emplace_back(out); + out->mutable_data(); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, inputs, &outputs, CastFuctor()); +} + +template +void CastKernel(const Context& dev_ctx, + const DenseTensor& x, + DataType out_dtype, + DenseTensor* out) { + PD_VISIT_ALL_TYPES(out_dtype, "CastCUDAKernelImpl", ([&] { + CastCUDAKernelImpl(dev_ctx, x, out); + })); +} + +} // namespace pten + +#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ + PT_REGISTER_CTX_KERNEL(cast, \ + GPU, \ + ALL_LAYOUT, \ + pten::CastKernel, \ + float, \ + double, \ + int, \ + int64_t, \ + int16_t, \ + bool, \ + uint8_t, \ + paddle::platform::float16, \ + paddle::platform::complex, \ + paddle::platform::complex, \ + ##__VA_ARGS__) { \ + kernel->OutputAt(0).SetDataType( \ + paddle::experimental::DataType::UNDEFINED); \ + } + +#if !defined(PADDLE_WITH_HIP) +PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16) +#else +PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) +#endif diff --git a/paddle/pten/kernels/gpu/conj_kernel.cu b/paddle/pten/kernels/gpu/complex_kernel.cu similarity index 53% rename from paddle/pten/kernels/gpu/conj_kernel.cu rename to paddle/pten/kernels/gpu/complex_kernel.cu index cb4fef883fdac..5a3c14de4036a 100644 --- a/paddle/pten/kernels/gpu/conj_kernel.cu +++ b/paddle/pten/kernels/gpu/complex_kernel.cu @@ -12,28 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/pten/kernels/gpu/conj_kernel.h" +#include "paddle/pten/kernels/complex_kernel.h" +#include "paddle/pten/kernels/impl/complex_kernel_impl.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/hybird/math/conj_impl.h" -namespace pten { +// See Note [ Why still include the fluid headers? 
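// For reference, the PD_VISIT_ALL_TYPES call above turns the runtime
// out_dtype into a compile-time type for the lambda (conventionally exposed
// as data_t), conceptually similar to:
//
//   switch (out_dtype) {
//     case DataType::FLOAT32: { using data_t = float;    /* run lambda */ } break;
//     case DataType::INT64:   { using data_t = int64_t;  /* run lambda */ } break;
//     // ... one case per DataType the macro covers ...
//   }
//
// so CastCUDAKernelImpl gets instantiated for each (InT, OutT) pair that can
// occur at runtime.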
] +#include "paddle/fluid/platform/complex.h" -template -void Conj(const GPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) { - ConjImpl(dev_ctx, x, out); -} - -} // namespace pten - -PT_REGISTER_KERNEL(conj, - GPU, - ALL_LAYOUT, - pten::Conj, - paddle::platform::complex, - paddle::platform::complex, - float, - double, - int, - int64_t) {} +PT_REGISTER_CTX_KERNEL(conj, + GPU, + ALL_LAYOUT, + pten::Conj, + paddle::platform::complex, + paddle::platform::complex, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/gpu/utils.cu b/paddle/pten/kernels/gpu/copy_kernel.cu similarity index 97% rename from paddle/pten/kernels/gpu/utils.cu rename to paddle/pten/kernels/gpu/copy_kernel.cu index 4d080be11e3ed..877a06ce33e5d 100644 --- a/paddle/pten/kernels/gpu/utils.cu +++ b/paddle/pten/kernels/gpu/copy_kernel.cu @@ -12,15 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/memcpy.h" +#include "paddle/pten/kernels/copy_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/gpu/utils.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/memcpy.h" namespace pten { -void Copy(const GPUContext& dev_ctx, +template +void Copy(const Context& dev_ctx, const DenseTensor& src, bool blocking, DenseTensor* dst) { @@ -232,6 +237,8 @@ void Copy(const GPUContext& dev_ctx, } } } + } // namespace pten -PT_REGISTER_NO_TEMPLATE_KERNEL(copy, GPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL( + copy, GPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {} diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu new file mode 100644 index 0000000000000..6b66d45b7dd48 --- /dev/null +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/dot_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" + +// See Note [ Why still include the fluid headers? 
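// Usage note for the now-templated Copy above (a sketch; it assumes src and
// dst already have compatible metas, as in the Flatten/FlattenGrad kernels
// earlier in this patch):
//
//   pten::Copy(dev_ctx, src, /*blocking=*/false, &dst);
//
// With blocking=false the GPU path enqueues the memcpy on the context's
// stream; blocking=true is the synchronous variant for callers that need the
// data immediately.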
] +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/complex.h" + +namespace pten { + +template +void Dot(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + out->mutable_data(); + if (1 == out->dims().size()) { + auto eigen_out = pten::EigenScalar::From(*out); + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + + auto& dev = *dev_ctx.eigen_device(); + eigen_out.device(dev) = (eigen_x * eigen_y).sum(); + } else { + auto eigen_out = pten::EigenMatrix::From(*out); + auto eigen_x = pten::EigenMatrix::From(x); + auto eigen_y = pten::EigenMatrix::From(y); + + auto& dev = *dev_ctx.eigen_device(); + eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes(1)); + } +} + +} // namespace pten + +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + +PT_REGISTER_CTX_KERNEL(dot, + GPU, + ALL_LAYOUT, + pten::Dot, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h new file mode 100644 index 0000000000000..f78328c01a30d --- /dev/null +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -0,0 +1,863 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" + +namespace pten { + +namespace kps = paddle::operators::kernel_primitives; +enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 }; + +/* Packing scalar type T(float, int etc.) 
into Array type + for supporting multiple-output feature in elementwise system.*/ +template +using ConditionalT = + typename std::conditional_t>; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result); +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseAny( + result, args, func); + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseUnary( + result, args[0], func); + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseBinary( + result, args[0], args[1], func); + } +}; + +template +struct ElementwisePrimitiveCaller { + __device__ inline void operator()(Functor func, + InT (*args)[VecSize], + OutT *result) { + kps::ElementwiseTernary( + result, args[0], args[1], args[2], func); + } +}; + +template +struct ElementwiseWriteDataCaller { + __device__ __forceinline__ void operator()( + paddle::framework::Array outs, + ConditionalT src[VecSize], + int block_offset, + int num) { + OutT dst[NumOuts][VecSize]; +#pragma unroll + for (int i = 0; i < VecSize; ++i) { +#pragma unroll + for (int j = 0; j < NumOuts; ++j) { + dst[j][i] = (src[i])[j]; + } + } +#pragma unroll + for (int i = 0; i < NumOuts; ++i) { + kps::WriteData( + outs[i] + block_offset, dst[i], num); + } + } +}; + +template +struct ElementwiseWriteDataCaller { + __device__ __forceinline__ void operator()( + paddle::framework::Array outs, + OutT src[VecSize], + int block_offset, + int num) { + kps::WriteData( + outs[0] + block_offset, src, num); + } +}; + +template +__device__ void VectorizedElementwiseKernelImpl( + const paddle::framework::Array &in, + paddle::framework::Array outs, + int num, + int data_offset, + Functor func) { + InT args[Arity][VecSize]; + ConditionalT result[VecSize]; + +#pragma unroll + for (int i = 0; i < Arity; i++) { + kps::Init(args[i], static_cast(1.0f)); + kps::ReadData( + args[i], in[i] + data_offset, num); + } + + constexpr bool kCallElementwiseAny = + paddle::platform::FunctionTraits::has_pointer_args; + ElementwisePrimitiveCaller, + VecSize, + Functor, + Arity, + kCallElementwiseAny>()(func, args, result); + + ElementwiseWriteDataCaller()( + outs, result, data_offset, num); +} + +template +__global__ void VectorizedElementwiseKernel( + paddle::framework::Array ins, + paddle::framework::Array outs, + int size, + int main_offset, + Functor func) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + for (; data_offset < main_offset; data_offset += stride) { + VectorizedElementwiseKernelImpl( + ins, outs, VecSize * BLOCK_NUM_X, data_offset, func); + } + + int num = size - data_offset; + if (num > 0) { + VectorizedElementwiseKernelImpl(ins, outs, num, data_offset, func); + } +} + +template +int GetVectorizedSizeForTensors(const std::vector &ins, + const std::vector &outs) { + int vec_size = 4; + for (auto iter = ins.begin(); iter != ins.end(); ++iter) { + vec_size = std::min( + vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); + } + for (auto iter = outs.begin(); iter != outs.end(); ++iter) { + vec_size = std::min( + vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); + } + return vec_size; +} + +template +void 
ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { + auto numel = ins[0]->numel(); + int block_size = funcs::GetThreadsConfig(ctx, numel, VecSize); + int grid_size = + ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; + auto stream = ctx.stream(); + paddle::framework::Array ins_data; + paddle::framework::Array outs_data; + + for (int i = 0; i < Arity; ++i) { + ins_data[i] = ins[i]->data(); + } + for (int i = 0; i < NumOuts; ++i) { + outs_data[i] = (*outs)[i]->mutable_data(); + } +#ifdef PADDLE_WITH_XPU2 + block_size = 128; + grid_size = 8; + int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; + VectorizedElementwiseKernel<<>>( + ins_data, outs_data, numel, main_offset, func); +#else + int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; + VectorizedElementwiseKernel<<>>( + ins_data, outs_data, numel, main_offset, func); +#endif +} + +template +void LaunchSameDimsElementwiseCudaKernel( + const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { + using Traits = paddle::platform::FunctionTraits; + const int kArity = + Traits::has_pointer_args ? static_cast(ET) : Traits::arity; + PADDLE_ENFORCE_EQ(ins.size(), + kArity, + paddle::platform::errors::InvalidArgument( + "The number of inputs is expected to be equal to the " + "arity of functor. But recieved: the number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), + kArity)); + PADDLE_ENFORCE_EQ(outs->size(), + NumOuts, + paddle::platform::errors::InvalidArgument( + "Number of outputs shall equal to number of functions, " + "but number of outputs is %d, of functions is %d.", + outs->size(), + NumOuts)); + + if (NumOuts > 1) { + for (int i = 1; i < NumOuts; ++i) { + PADDLE_ENFORCE_EQ( + (*outs)[i]->dims(), + (*outs)[0]->dims(), + paddle::platform::errors::InvalidArgument( + "The shape of each output tensor shall be identical yet, " + "but %dth output tensor`s shape is not.", + i)); + } + } + + // calculate the max vec_size for all ins and outs + int vec_size = GetVectorizedSizeForTensors(ins, *outs); + switch (vec_size) { + case 4: + ElementwiseCudaKernel( + ctx, ins, outs, func); + break; + case 2: + ElementwiseCudaKernel( + ctx, ins, outs, func); + break; + case 1: + ElementwiseCudaKernel( + ctx, ins, outs, func); + break; + default: { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +struct DimensionsTransform { + using DimVector = std::vector; + typedef void (*MergeFunctor)( + bool &, std::vector &, DimVector &, int, int); + int64_t dim_size; + DimVector out_dims; + std::vector in_dims; + + private: + // To compensate the lackage of input_tensors` dimension with input variable + // 'axis' + void InputDimensionsExtend(int N, int axis) { + for (auto &in_dim : in_dims) { + int64_t in_idx = 0; + if (in_dim.size() < dim_size) { + DimVector tmp_dim(dim_size, 1); + do { + if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { + tmp_dim[axis] = in_dim[in_idx]; + in_idx++; + axis++; + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", + in_idx + 1, + axis + 1, + out_dims[axis], + in_dim[in_idx])); + } + } while (in_idx < in_dim.size()); + in_dim.resize(dim_size); + std::copy(tmp_dim.begin(), 
tmp_dim.end(), in_dim.begin()); + } else { + do { + if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { + in_idx++; + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "recieved %d.", + in_idx + 1, + in_idx + 1, + out_dims[in_idx], + in_dim[in_idx])); + } + } while (in_idx < dim_size); + } + std::reverse(in_dim.begin(), in_dim.end()); + } + std::reverse(out_dims.begin(), out_dims.end()); + } + + template + __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { + auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { + (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx, + vec->begin() + m_idx, + 1, + std::multiplies()); + vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); + }; + + int64_t i = 0; + while (i < dim_size) { + int cnt = 0; + int low_idx = i; + bool equal = true; + do { + merge_func(equal, in_dims, out_dims, i, N); + if (equal) { + i++; + cnt++; + } else { + break; + } + } while (i < dim_size); + + if (cnt > 1) { + for (auto &in_dim : in_dims) { + VectorReorganise(&in_dim, low_idx, i); + } + VectorReorganise(&out_dims, low_idx, i); + dim_size -= --cnt; + i -= cnt; + } else if (cnt < 1) { + i++; + } + } + } + + public: + explicit DimensionsTransform(const std::vector &ins, + const paddle::framework::DDim &dims, + int axis) { + const int N = ins.size(); + dim_size = dims.size(); + out_dims = paddle::framework::vectorize(dims); + in_dims.resize(N); + for (int j = 0; j < N; ++j) { + in_dims[j] = paddle::framework::vectorize(ins[j]->dims()); + } + InputDimensionsExtend(N, axis); + + auto merge_sequential_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + for (int j = 1; j < num; ++j) { + equal &= (in_dims[0][i] == in_dims[j][i]) ? true : false; + } + }; + auto merge_sequential_one_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + equal = in_dims[0][i] == 1; + if (equal) { + for (int j = 1; j < num; ++j) { + equal &= in_dims[j][i] == out[i]; + } + } + }; + // To Merge the dimensions of input_tensors while the consequtive + // equal-dimensions appears. + MergeFunctor merge_ptr = merge_sequential_dims; + MergeDimensions(merge_ptr, N); + + int min_idx = 0; + int min_val = std::accumulate( + in_dims[0].begin(), in_dims[0].end(), 1, std::multiplies()); + for (int j = 1; j < N; ++j) { + int temp = std::accumulate( + in_dims[j].begin(), in_dims[j].end(), 1, std::multiplies()); + min_val = min_val > temp ? temp : min_val; + min_idx = min_val == temp ? j : min_idx; + } + std::swap(in_dims[0], in_dims[min_idx]); + + // To Merge the dimension of input_tensors while the consequtive + // 1-value-dimensions appears. 
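// Illustration of the two merge passes (the shapes are made-up examples):
// the first pass (merge_sequential_dims) collapses runs of axes on which
// every input agrees, so broadcasting [2, 3, 4] against [1, 3, 4] keeps only
// two effective axes because the shared 3x4 block becomes a single length-12
// axis. The pass configured below (merge_sequential_one_dims) does the same
// for runs where one input holds 1s while the others match the output, e.g.
// an input padded to [1, 1, 4] against a [2, 3, 4] output has its two size-1
// axes folded into one, so the per-input BroadcastConfig tracks fewer
// dimensions.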
+ merge_ptr = merge_sequential_one_dims; + MergeDimensions(merge_ptr, N); + std::swap(in_dims[min_idx], in_dims[0]); + } +}; + +template +__device__ __forceinline__ void LoadData( + T *dst, + const T *__restrict__ src, + uint32_t block_offset, + const kps::details::BroadcastConfig &config, + int numel, + int num, + bool need_broadcast) { + // numel : whole num of output + // num: how many data will be deal with in this time + if (need_broadcast) { + kps::ReadDataBc( + dst, src, block_offset, config, numel); + } else { + kps::ReadData(dst, src + block_offset, num); + } +} + +template +__device__ void ElementwiseBroadcastKernelImpl( + const paddle::framework::Array &ins, + paddle::framework::Array outs, + const paddle::framework::Array &use_broadcast, + uint32_t numel, + const paddle::framework::Array, Arity> + &configs, + int num, + int block_offset, + Functor func) { + InT args[Arity][VecSize]; + ConditionalT result[VecSize]; + +#pragma unroll + for (int i = 0; i < Arity; i++) { + kps::Init(args[i], static_cast(1.0f)); + LoadData(args[i], + ins[i], + block_offset, + configs[i], + numel, + num, + use_broadcast[i]); + } + constexpr bool kCallElementwiseAny = + paddle::platform::FunctionTraits::has_pointer_args; + ElementwisePrimitiveCaller, + VecSize, + Functor, + Arity, + kCallElementwiseAny>()(func, args, result); + + ElementwiseWriteDataCaller()( + outs, result, block_offset, num); +} + +template +__global__ void ElementwiseBroadcastKernel( + paddle::framework::Array ins, + paddle::framework::Array outs, + paddle::framework::Array use_broadcast, + uint32_t numel, + paddle::framework::Array, Arity> + configs, + int main_offset, + int tail_tid, + Functor func) { + int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + +#ifdef PADDLE_WITH_XPU2 + for (; block_offset < main_offset; block_offset += stride) { + ElementwiseBroadcastKernelImpl(ins, + outs, + use_broadcast, + numel, + configs, + BLOCK_NUM_X * VecSize, + block_offset, + func); + } + if (block_offset < numel) { + ElementwiseBroadcastKernelImpl( + ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); + } +#else + if (block_offset < main_offset) { + ElementwiseBroadcastKernelImpl(ins, + outs, + use_broadcast, + numel, + configs, + BLOCK_NUM_X * VecSize, + block_offset, + func); + } else { + ElementwiseBroadcastKernelImpl( + ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); + } +#endif +} + +template +void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + Functor func, + DimensionsTransform merge_dims) { + int numel = (*outs)[0]->numel(); + const int threads = 256; + int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; + + int main_offset = (numel / (VecSize * threads)) * VecSize * threads; + int tail_tid = numel % (VecSize * threads); + auto stream = ctx.stream(); + + paddle::framework::Array, Arity> configs; + paddle::framework::Array use_broadcast; + paddle::framework::Array ins_data; + paddle::framework::Array outs_data; + + for (int i = 0; i < NumOuts; ++i) { + outs_data[i] = (*outs)[i]->mutable_data(); + } + + for (int i = 0; i < Arity; i++) { + use_broadcast[i] = (ins[i]->numel() != numel); + ins_data[i] = ins[i]->data(); + if (use_broadcast[i]) { + // get the broadcast config, + // if data shape is[m, n], then you should set data_dim = {n, m} + // eg: out's shape [3, 45, 1]. 
then out_dims = {1, 45, 3} + configs[i] = kps::details::BroadcastConfig( + merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); + } + } + +#ifdef PADDLE_WITH_XPU2 + threads = 128; + blocks = 8; + main_offset = (numel / (VecSize * threads)) * VecSize * threads; + tail_tid = numel % (VecSize * threads); + ElementwiseBroadcastKernel<<>>(ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + func); +#else + ElementwiseBroadcastKernel<<>>( + ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + func); +#endif +} + +template +void LaunchBroadcastKernelForDifferentVecSize( + const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); + +#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ + case rank: { \ + LaunchKernel( \ + ctx, ins, outs, func, merge_dims); \ + } break; + + switch (merge_dims.dim_size) { + CALL_BROADCAST_FOR_DIM_SIZE(1); + CALL_BROADCAST_FOR_DIM_SIZE(2); + CALL_BROADCAST_FOR_DIM_SIZE(3); + CALL_BROADCAST_FOR_DIM_SIZE(4); + CALL_BROADCAST_FOR_DIM_SIZE(5); + CALL_BROADCAST_FOR_DIM_SIZE(6); + CALL_BROADCAST_FOR_DIM_SIZE(7); + CALL_BROADCAST_FOR_DIM_SIZE(8); + default: { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The maximum dimension of input tensor is expected to be less than " + "%d, but recieved %d.\n", + merge_dims.dim_size, + paddle::framework::DDim::kMaxRank)); + } + } +#undef CALL_BROADCAST_FOR_DIM_SIZE +} + +template +void LaunchBroadcastElementwiseCudaKernel( + const paddle::platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + using Traits = paddle::platform::FunctionTraits; + const int kArity = + Traits::has_pointer_args ? static_cast(ET) : Traits::arity; + PADDLE_ENFORCE_EQ(ins.size(), + kArity, + paddle::platform::errors::InvalidArgument( + "The number of inputs is expected to be equal to the " + "arity of functor. But recieved: the number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), + kArity)); + PADDLE_ENFORCE_LE(kArity, + 3, + paddle::platform::errors::InvalidArgument( + "Currently only broadcast of ternary is supported " + "and verified, but received %d.", + kArity)); + PADDLE_ENFORCE_EQ(outs->size(), + NumOuts, + paddle::platform::errors::InvalidArgument( + "Number of outputs shall equal to number of functions, " + "but number of outputs is %d, of functions is %d.", + outs->size(), + NumOuts)); + int in_vec_size = 4; + int out_vec_size = 4; + if (NumOuts > 1) { + for (int i = 0; i < NumOuts; ++i) { + PADDLE_ENFORCE_EQ( + (*outs)[i]->dims(), + (*outs)[0]->dims(), + paddle::platform::errors::InvalidArgument( + "The shape of each output tensor shall be identical yet, but " + "%dth output tensor`s shape is not.", + i)); + out_vec_size = std::min( + paddle::platform::GetVectorizedSize((*outs)[i]->data()), + out_vec_size); + } + } else { + out_vec_size = + paddle::platform::GetVectorizedSize((*outs)[0]->data()); + } + + for (auto *in : ins) { + auto temp_size = paddle::platform::GetVectorizedSize(in->data()); + in_vec_size = in->dims() == (*outs)[0]->dims() + ? 
std::min(temp_size, in_vec_size) + : in_vec_size; + } + int vec_size = std::min(out_vec_size, in_vec_size); + + switch (vec_size) { + case 4: { + LaunchBroadcastKernelForDifferentVecSize(ctx, ins, outs, axis, func); + break; + } + case 2: { + LaunchBroadcastKernelForDifferentVecSize(ctx, ins, outs, axis, func); + break; + } + case 1: { + LaunchBroadcastKernelForDifferentVecSize(ctx, ins, outs, axis, func); + break; + } + default: { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +template +void LaunchElementwiseCudaKernel( + const paddle::platform::CUDADeviceContext &cuda_ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor func) { + std::vector dims_size; + bool no_broadcast_flag = true; + for (auto *in : ins) { + no_broadcast_flag &= ins[0]->dims() == in->dims(); + dims_size.emplace_back(in->dims().size()); + } + if (no_broadcast_flag) { + LaunchSameDimsElementwiseCudaKernel( + cuda_ctx, ins, outs, func); + } else { + axis = axis == -1 + ? *std::max_element(dims_size.begin(), dims_size.end()) - + *std::min_element(dims_size.begin(), dims_size.end()) + : axis; + LaunchBroadcastElementwiseCudaKernel( + cuda_ctx, ins, outs, axis, func); + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/gpu/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu index 16389d7749bf1..ae1f8529db3de 100644 --- a/paddle/pten/kernels/gpu/full_kernel.cu +++ b/paddle/pten/kernels/gpu/full_kernel.cu @@ -21,7 +21,7 @@ limitations under the License. */ PT_REGISTER_CTX_KERNEL(full, GPU, ALL_LAYOUT, - pten::Full, + pten::FullKernel, float, double, uint8_t, @@ -36,7 +36,7 @@ PT_REGISTER_CTX_KERNEL(full, PT_REGISTER_CTX_KERNEL(full_like, GPU, ALL_LAYOUT, - pten::FullLike, + pten::FullLikeKernel, float, double, int, diff --git a/paddle/pten/kernels/gpu/linalg.cu b/paddle/pten/kernels/gpu/linalg.cu deleted file mode 100644 index c9bc4cbd07962..0000000000000 --- a/paddle/pten/kernels/gpu/linalg.cu +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pten/kernels/gpu/linalg.h" - -#include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/hybird/eigen/dot.h" -#include "paddle/pten/kernels/hybird/math/matmul_func.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/complex.h" - -namespace pten { - -template -void Dot(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - eigen::Dot(dev_ctx, x, y, out); -} - -template -void Matmul(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - DenseTensor* out) { - PADDLE_ENFORCE_NE(paddle::framework::product(x.dims()), - 0, - paddle::platform::errors::InvalidArgument( - "The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. 
")); - PADDLE_ENFORCE_NE(paddle::framework::product(y.dims()), - 0, - paddle::platform::errors::InvalidArgument( - "The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. ")); - math::MatMulFunction( - dev_ctx, x, y, out, transpose_x, transpose_y); -} - -} // namespace pten - -using float16 = paddle::platform::float16; -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; - -PT_REGISTER_KERNEL(dot, - GPU, - ALL_LAYOUT, - pten::Dot, - float, - double, - int, - int64_t, - complex64, - complex128) {} - -PT_REGISTER_KERNEL(matmul, - GPU, - ALL_LAYOUT, - pten::Matmul, - float, - double, - float16, - complex64, - complex128) {} diff --git a/paddle/pten/kernels/gpu/manipulation.cu b/paddle/pten/kernels/gpu/manipulation.cu deleted file mode 100644 index 5a82e3e030b9e..0000000000000 --- a/paddle/pten/kernels/gpu/manipulation.cu +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pten/api/ext/dispatch.h" -#include "paddle/pten/infermeta/unary.h" -#include "paddle/pten/kernels/gpu/manipulation.h" -#include "paddle/pten/kernels/gpu/utils.h" -#include "paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h" -#include "paddle/pten/kernels/hybird/general/manipulation.h" - -namespace pten { - -template -void Flatten(const GPUContext& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out) { - auto out_dims = out->dims(); - pten::Copy(dev_ctx, x, false, out); - out->Resize(out_dims); -} - -// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate -// Output Tensor, -// is there a more flexible way to deal with this case? 
-template -void FlattenWithXShape(const GPUContext& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out, - DenseTensor* xshape) { - Flatten(dev_ctx, x, start_axis, stop_axis, out); - general::SetXShape(x, xshape); -} - -void Reshape(const GPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* out) { - auto out_meta = InferMetaFromVecValue(x.meta(), shape.GetData()); - if (x.data() == out->data() && x.numel() == out->numel()) { - out->Resize(out_meta.dims); - return; - } - pten::Copy(dev_ctx, x, false, out); - out->Resize(out_meta.dims); - out->ResetLoD(x.lod()); -} - -void ReshapeWithXShape(const GPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* xshape, - DenseTensor* out) { - general::SetXShape(x, xshape); - Reshape(dev_ctx, x, shape, out); -} - -template -void Cast(const GPUContext& dev_ctx, - const DenseTensor& x, - DataType out_dtype, - DataType in_dtype, - DenseTensor* out) { - PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] { - detail::CastCUDAKernelImpl(dev_ctx, x, out); - })); -} - -} // namespace pten - -using float16 = paddle::platform::float16; - -PT_REGISTER_KERNEL(flatten, - GPU, - ALL_LAYOUT, - pten::Flatten, - float, - float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, - GPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} - -#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ - PT_REGISTER_KERNEL(cast, \ - GPU, \ - ALL_LAYOUT, \ - pten::Cast, \ - float, \ - double, \ - int, \ - int64_t, \ - int16_t, \ - bool, \ - uint8_t, \ - paddle::platform::float16, \ - paddle::platform::complex, \ - paddle::platform::complex, \ - ##__VA_ARGS__) { \ - kernel->OutputAt(0).SetDataType( \ - paddle::experimental::DataType::UNDEFINED); \ - } - -#if !defined(PADDLE_WITH_HIP) -PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16) -#else -PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) -#endif - -PT_REGISTER_NO_TEMPLATE_KERNEL(reshape, GPU, ANY, pten::Reshape, ALL_DTYPE) {} -PT_REGISTER_NO_TEMPLATE_KERNEL( - reshape_with_xshape, GPU, ANY, pten::ReshapeWithXShape, ALL_DTYPE) {} diff --git a/paddle/pten/kernels/gpu/manipulation.h b/paddle/pten/kernels/gpu/manipulation.h deleted file mode 100644 index b47fadd70bd17..0000000000000 --- a/paddle/pten/kernels/gpu/manipulation.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -#include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_registry.h" - -namespace pten { - -template -void Flatten(const GPUContext& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out); - -template -void Cast(const GPUContext& dev_ctx, - const DenseTensor& x, - DataType out_dtype, - DataType in_dtype, - DenseTensor* out); - -void Reshape(const GPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* out); - -void ReshapeWithXShape(const GPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* xshape, - DenseTensor* out); - -} // namespace pten - -#endif diff --git a/paddle/pten/kernels/gpu/math.cu b/paddle/pten/kernels/gpu/math.cu deleted file mode 100644 index 59d816d2377cd..0000000000000 --- a/paddle/pten/kernels/gpu/math.cu +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/kernels/gpu/math.h" - -#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" -#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h" -#include "paddle/pten/kernels/hybird/eigen/sign.h" -#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" -#include "paddle/pten/kernels/hybird/general/reduce_impl.h" - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/core/kernel_registry.h" - -namespace pten { - -/** - * Util Functors - */ - -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) - : n_inv(static_cast(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } - - private: - T n_inv; -}; - -/** - * Kernels - */ - -template -void Sign(const GPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) { - eigen::Sign(dev_ctx, x, out); -} - -template -void Mean(const GPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - pten::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -// Create the definition of Add -DEFINE_CUDA_ELEMENTWISE_OP(Add) -// Create the definition of Subtract -DEFINE_CUDA_ELEMENTWISE_OP(Subtract) -// Create the definition of Multiply -DEFINE_CUDA_ELEMENTWISE_OP(Multiply) -// Create the definition of Divide -DEFINE_CUDA_ELEMENTWISE_OP(Divide) - -template -void Sum(const GPUContext& dev_ctx, - const 
DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - pten::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace pten - -using float16 = paddle::platform::float16; -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; - -PT_REGISTER_KERNEL(sign, GPU, ALL_LAYOUT, pten::Sign, float, double, float16) {} -PT_REGISTER_KERNEL(mean, GPU, ALL_LAYOUT, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL(add, - GPU, - ALL_LAYOUT, - pten::Add, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_KERNEL(subtract, - GPU, - ALL_LAYOUT, - pten::Subtract, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_KERNEL(divide, - GPU, - ALL_LAYOUT, - pten::Divide, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_KERNEL(multiply, - GPU, - ALL_LAYOUT, - pten::Multiply, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128) {} -PT_REGISTER_KERNEL(sum, - GPU, - ALL_LAYOUT, - pten::Sum, - bool, - float, - double, - float16, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} diff --git a/paddle/pten/kernels/gpu/math.h b/paddle/pten/kernels/gpu/math.h deleted file mode 100644 index 5a872542fbd54..0000000000000 --- a/paddle/pten/kernels/gpu/math.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -#include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/dense_tensor.h" - -namespace pten { - -template -void Sign(const GPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out); - -template -void Mean(const GPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void Add(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Subtract(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Divide(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Multiply(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Sum(const GPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - -} // namespace pten - -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name(const GPUContext& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - out->mutable_data(); \ - LaunchElementwiseCudaKernel( \ - dev_ctx, inputs, &outputs, axis, general::name##Functor()); \ - } - -#endif diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu new file mode 100644 index 0000000000000..f41934313d674 --- /dev/null +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -0,0 +1,173 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/kernels/math_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" +#include "paddle/pten/kernels/gpu/elementwise.h" +#include "paddle/pten/kernels/gpu/reduce.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace pten { + +#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + std::vector inputs; \ + std::vector outputs; \ + inputs.emplace_back(&x); \ + inputs.emplace_back(&y); \ + outputs.emplace_back(out); \ + out->mutable_data(); \ + LaunchElementwiseCudaKernel( \ + dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ + } + +/** + * Util Functors + */ + +template +struct DivideFunctor { + HOSTDEVICE explicit inline DivideFunctor(int n) + : n_inv(static_cast(1.0 / n)) {} + + HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } + + private: + T n_inv; +}; + +/** + * Kernels + */ + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + pten::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +// Create the definition of Add +DEFINE_CUDA_ELEMENTWISE_OP(Add) +// Create the definition of Subtract +DEFINE_CUDA_ELEMENTWISE_OP(Subtract) +// Create the definition of Multiply +DEFINE_CUDA_ELEMENTWISE_OP(Multiply) +// Create the definition of Divide +DEFINE_CUDA_ELEMENTWISE_OP(Divide) + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + pten::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace pten + +using float16 = paddle::platform::float16; +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + +PT_REGISTER_CTX_KERNEL( + mean, GPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool, float16) {} +PT_REGISTER_CTX_KERNEL(add, + GPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(subtract, + GPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(divide, + GPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(multiply, + GPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(sum, + GPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + float16, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu new file mode 100644 index 0000000000000..debda455818a9 --- /dev/null +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/kernels/matmul_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/kernels/impl/matmul_kernel_impl.h" + +PT_REGISTER_CTX_KERNEL(matmul, + GPU, + ALL_LAYOUT, + pten::MatmulKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/gpu/reduce.h similarity index 56% rename from paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h rename to paddle/pten/kernels/gpu/reduce.h index bdb883c1df871..5a736ef0e6e72 100644 --- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -14,6 +14,9 @@ #pragma once +// CUDA and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + #include #include #include @@ -31,29 +34,28 @@ namespace cub = hipcub; #include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/fluid/string/string_helper.h" -#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" #include "paddle/pten/api/ext/dispatch.h" -#include "paddle/pten/api/include/tensor.h" -#include "paddle/pten/kernels/gpu/utils.h" -#include "paddle/pten/kernels/hybird/math/cast_func.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/gpu/elementwise.h" // Reduce split or not, Whether to use ReduceHigherDim #define REDUCE_SPLIT_BOUNDARY 512 #define REDUCE_VEC_SIZE 4 -namespace pten { -namespace detail { - namespace kps = paddle::operators::kernel_primitives; +namespace pten { +namespace kernels { + namespace details { static inline int GetLastPow2(int n) { @@ -68,11 +70,11 @@ static inline int GetLastPow2(int n) { static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } // get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny -static inline std::vector GetDimStrides( - const std::vector& dims, const std::vector& idx) { +static inline std::vector GetDimStrides(const std::vector& dims, + const std::vector& idx) { int n = static_cast(idx.size()); - if (n == 0) return std::vector(); - std::vector strides(n); + if (n == 0) return std::vector(); + std::vector strides(n); strides.back() = 1; for (int i = n - 2; i >= 0; --i) { 
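// (illustrative) e.g. with dims = [2, 3, 4] and idx = [0, 1, 2], the
// recurrence on the next line produces row-major strides [12, 4, 1].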
strides[i] = strides[i + 1] * dims[idx[i + 1]]; @@ -133,9 +135,34 @@ static inline paddle::framework::Array VectorToArray( return ret; } +static inline std::vector GetReduceDim(const std::vector& dims, + int dim_size, + bool reduce_all) { + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(dim_size); + int reduce_size = reduce_dims.size(); + for (int i = 0; i < reduce_size; ++i) { + reduce_dims[i] = i; + } + } else { + for (auto e : dims) { + PADDLE_ENFORCE_LT(e, + dim_size, + paddle::platform::errors::InvalidArgument( + "ReduceOp: invalid axis, when x_dims is %d, " + "axis[i] should less than x_dims, but got %d.", + dim_size, + e)); + reduce_dims.push_back(e >= 0 ? e : e + dim_size); + } + } + return reduce_dims; +} + } // namespace details -constexpr int kMaxRank = pten::DDim::kMaxRank; +constexpr int kMaxRank = paddle::framework::DDim::kMaxRank; enum ReduceType { kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; @@ -145,9 +172,9 @@ enum ReduceType { struct IndexCalculator { IndexCalculator(int dim, - const std::vector& cal_dims, - const std::vector& cal_strides, - const std::vector& full_strides) + const std::vector& cal_dims, + const std::vector& cal_strides, + const std::vector& full_strides) : dim(dim) { dims = details::VectorToArray(cal_dims); strides = details::VectorToArray(full_strides); @@ -275,8 +302,8 @@ struct OneDimIndexCal { // reduce config template struct ReduceConfig { - ReduceConfig(const std::vector& origin_reduce_dims, - const std::vector& origin_x_dim) + ReduceConfig(const std::vector& origin_reduce_dims, + const std::vector& origin_x_dim) : reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {} // get the parameters of reduceKernel @@ -312,17 +339,17 @@ struct ReduceConfig { // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1] // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] void SetReduceDim() { - std::set reduce_set; + std::set reduce_set; for (auto e : reduce_dims_origin) { auto pos = e >= 0 ? 
e : e + x_dim.size(); reduce_set.insert(pos); } - std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); + std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); // update reduce_dim and x_dim - std::vector x_new_dim; + std::vector x_new_dim; reduce_dim.push_back(reduce_dim_temp[0]); x_new_dim.push_back(x_dim[0]); @@ -355,15 +382,15 @@ struct ReduceConfig { // update x_dim x_dim = x_new_dim; - std::vector().swap(x_new_dim); + std::vector().swap(x_new_dim); - std::vector reduce_dim_new; + std::vector reduce_dim_new; int is_reduced = 0; for (auto e : reduce_dim) { is_reduced |= 1 << e; } - std::vector().swap(reduce_dim); + std::vector().swap(reduce_dim); for (int i = 0; i < x_dim.size(); i++) { if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { @@ -400,7 +427,7 @@ struct ReduceConfig { // --SetStrides--> x_strides= [6,1], reduce_strides = [1], // left_strides = [1] void SetStrides() { - std::vector idx_dim; + std::vector idx_dim; for (int i = 0; i < x_dim.size(); i++) { idx_dim.push_back(i); } @@ -575,13 +602,13 @@ struct ReduceConfig { } public: - std::vector reduce_dims_origin; - std::vector reduce_dim; - std::vector x_dim; - std::vector left_dim; - std::vector x_strides; - std::vector left_strides; - std::vector reduce_strides; + std::vector reduce_dims_origin; + std::vector reduce_dim; + std::vector x_dim; + std::vector left_dim; + std::vector x_strides; + std::vector left_strides; + std::vector reduce_strides; int reduce_type; int reduce_num; @@ -596,15 +623,223 @@ struct ReduceConfig { dim3 grid; }; -template +// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or +// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this +// function will be used +template +__global__ void ReduceAnyKernel(const Tx* x, + Ty* y, + ReduceOp reducer, + TransformOp transformer, + MPType init, + int reduce_num, + int left_num, + bool reduce_last_dim, + const Calculator reduce_index_calculator, + const Calculator left_index_calculator, + const kps::DimConfig dim) { + int input_idx, left_idx, stride; + int block_size = 0; + bool need_store = true; + int loop_left = 0; + int tid = 0; + // the last dim gets involved in reduction + int store_offset = 0; + int stride_left = 0; + if (reduce_last_dim) { + auto block = ReduceIndexMapping(dim); + input_idx = block.BlockIdY() * block.BlockDimX(); + left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y; + stride = block.GridDimY() * block.BlockDimX(); + block_size = block.BlockDimX(); + need_store = (THREAD_ID_X == 0) && (left_idx < left_num); + store_offset = block.BlockIdY() * left_num + left_idx; + loop_left = min(block.GetLoopSize(), left_num - left_idx); + stride_left = 1; + tid = threadIdx.x; + } else { + auto block = ReduceIndexMapping(dim); + input_idx = block.BlockIdY() * block.BlockDimY(); + left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X; + stride = block.GridDimY() * block.BlockDimY(); + block_size = block.BlockDimY(); + need_store = (THREAD_ID_Y == 0) && (left_idx < left_num); + loop_left = min(block.GetLoopSize(), left_num - left_idx); + stride_left = block.BlockDimX() * block.GridDimX(); + store_offset = block.BlockIdY() * left_num + left_idx; + tid = threadIdx.y; + } + // calculate the offset, means the addr where each thread really start. + // 1. 
reduce for each thread + MPType input_compute[REDUCE_VEC_SIZE]; + Tx input_reg[REDUCE_VEC_SIZE]; + for (int i = 0; i < loop_left; i += stride_left) { + int input_offset = left_index_calculator(left_idx + i); + const Tx* input = x + input_offset; + MPType reduce_var = init; + // load REDUCE_VEC_SIZE data once, and then compute + int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; + for (; input_idx + block_size < bound; + input_idx += REDUCE_VEC_SIZE * stride) { + kps::ReadDataReduce, + false>(&input_reg[0], + input, + input_idx, + reduce_index_calculator, + 1, + reduce_num, + 1, + stride, + kps::IdentityFunctor(), + reduce_last_dim); + kps::ElementwiseUnary( + &input_compute[0], &input_reg[0], transformer); + kps::Reduce( + &reduce_var, &input_compute[0], reducer, reduce_last_dim); + } + + kps::Init(&input_compute[0], init); + kps::ReadDataReduce(&input_compute[0], + input, + input_idx, + reduce_index_calculator, + 1, + reduce_num - input_idx, + 1, + stride, + transformer, + reduce_last_dim); + kps::Reduce( + &reduce_var, &input_compute[0], reducer, reduce_last_dim); + + kps::Reduce( + &reduce_var, &reduce_var, reducer, reduce_last_dim); + if (need_store) { + y[store_offset + i] = static_cast(reduce_var); + } + } +} + +template +__global__ void ReduceHigherDimKernel(const Tx* x, + Ty* y, + ReduceOp reducer, + TransformOp transformer, + MPType init, + int reduce_num, + int left_num, + int blocking_size, + const kps::DimConfig dim) { + // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this + // function will be used + auto block = ReduceIndexMapping(dim); + int idy = block.BlockIdY() * blocking_size; + int idx = block.BlockIdX() * block.BlockDimX(); + int idz = BLOCK_ID_Z * left_num; + int stride = dim.split_num_x * dim.deal_size_x; + int size = left_num - dim.rem_x; + int loop_size = min(reduce_num - idy, blocking_size); + int store_offset = block.BlockIdY() * left_num + idz * block.GridDimY(); + int block_offset = idy * left_num + idz * reduce_num; + const Tx* input = x + block_offset; + Tx reduce_input; + for (; idx < size; idx += stride) { + MPType reduce_var = init; + MPType reduce_compute = init; + for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { + kps::ReadData(&reduce_input, + input + loop_idx * left_num + idx, + block.BlockDimX(), + 1, + 1, + left_num); + kps::ElementwiseUnary( + &reduce_compute, &reduce_input, transformer); + kps::Reduce( + &reduce_var, &reduce_compute, reducer, false); + } + Ty result = static_cast(reduce_var); + kps::WriteData( + y + store_offset + idx, &result, block.BlockDimX()); + } + + if (idx < left_num) { + MPType reduce_var = init; + MPType reduce_compute = init; + for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) { + kps::ReadData(&reduce_input, + input + loop_idx * left_num + idx, + dim.rem_x, + 1, + 1, + left_num); + kps::ElementwiseUnary( + &reduce_compute, &reduce_input, transformer); + kps::Reduce( + &reduce_var, &reduce_compute, reducer, false); + } + Ty result = static_cast(reduce_var); + kps::WriteData( + y + store_offset + idx, &result, dim.rem_x); + } +} + +template static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, + const TransformOp& transform, MPType init, gpuStream_t stream, ReduceConfig config) { - using TransformOp = typename ReduceOp::Transformer; - if (config.reduce_type == kReduceLastDim) { int stride_reduce = 1; int stride_left = config.reduce_num; @@ -621,35 +856,33 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % 
config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU2 - paddle::operators::ReduceAnyKernel<<<8, 128, stream>>>( - x_data, - config.output_data, - reducer, - TransformOp(config.reduce_num), - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + ReduceAnyKernel<<<8, 128, stream>>>(x_data, + config.output_data, + reducer, + transform, + init, + config.reduce_num, + config.left_num, + config.reduce_last_dim, + reduce_index_calculator, + left_index_calculator, + dim); #else - paddle::operators::ReduceAnyKernel< - Tx, - Ty, - MPType, - ReduceOp, - TransformOp, - OneDimIndexCal><<>>( + ReduceAnyKernel<<>>( x_data, config.output_data, reducer, - TransformOp(config.reduce_num), + transform, init, config.reduce_num, config.left_num, @@ -678,16 +911,16 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU2 - paddle::operators::ReduceAnyKernel<<<8, 128, stream>>>( + ReduceAnyKernel<<<8, 128, stream>>>( x_data, config.output_data, reducer, - TransformOp(config.reduce_num), + transform, init, config.reduce_num, config.left_num, @@ -696,17 +929,16 @@ static void LaunchReduceKernel(const Tx* x_data, left_index_calculator, dim); #else - paddle::operators::ReduceAnyKernel< - Tx, - Ty, - MPType, - ReduceOp, - TransformOp, - IndexCalculator><<>>( + ReduceAnyKernel<<>>( x_data, config.output_data, reducer, - TransformOp(config.reduce_num), + transform, init, config.reduce_num, config.left_num, @@ -734,23 +966,22 @@ static void LaunchReduceKernel(const Tx* x_data, kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim.SetRem(config.left_num % block.x, 0, 0); #ifdef PADDLE_WITH_XPU2 - paddle::operators::ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 128, stream>>>( + ReduceHigherDimKernel><<<8, 128, stream>>>( config.output_data, y_data, reducer, - kps::IdentityFunctor(config.grid.y), + kps::IdentityFunctor(), init, config.grid.y, config.left_num, config.grid.y, dim); #else - paddle::operators::ReduceHigherDimKernel< + ReduceHigherDimKernel< Ty, Ty, MPType, @@ -759,7 +990,7 @@ static void LaunchReduceKernel(const Tx* x_data, config.output_data, y_data, reducer, - kps::IdentityFunctor(config.grid.y), + kps::IdentityFunctor(), init, config.grid.y, config.left_num, @@ -769,40 +1000,88 @@ static void LaunchReduceKernel(const Tx* x_data, } } -static void AsyncCopy(const DenseTensor& src, DenseTensor* dst) { - paddle::platform::DeviceContextPool& pool = - paddle::platform::DeviceContextPool::Instance(); - const paddle::platform::CUDADeviceContext* dev_ctx; - if (paddle::platform::is_gpu_place(dst->place()) || - paddle::platform::is_npu_place(dst->place())) { - dev_ctx = static_cast( - pool.Get(dst->place())); +template class ReduceOp, + typename TransformOp> +static + typename std::enable_if::value, + void>::type + CubTensorReduceFunctorImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const paddle::platform::Place& place, + gpuStream_t stream) { + auto reducer = ReduceOp(); + cub::TransformInputIterator trans_x(x_data, + transform); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, + temp_storage_bytes, + trans_x, + y_data, + reduce_num, + reducer, + reducer.initial(), + stream); - } else { - dev_ctx = static_cast( - pool.Get(src.place())); - } + pten::DenseTensor tmp = pten::DenseTensor( + pten::make_intrusive(place), + 
pten::DenseTensorMeta(pten::DataType::UINT8, + paddle::framework::make_ddim( + {static_cast(temp_storage_bytes)}))); + + auto* temp_storage = tmp.mutable_data(); + + cub::DeviceReduce::Reduce(temp_storage, + temp_storage_bytes, + trans_x, + y_data, + reduce_num, + reducer, + reducer.initial(), + stream); +} - pten::Copy(*dev_ctx, src, false, dst); +template class ReduceOp, + typename TransformOp> +static + typename std::enable_if::value, + void>::type + CubTensorReduceFunctorImpl(const Tx* x_data, + Ty* y_data, + const TransformOp& transform, + int reduce_num, + const paddle::platform::Place& place, + gpuStream_t stream) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); } template class ReduceOp> + template class ReduceOp, + typename TransformOp> void TensorReduceFunctorImpl(const pten::DenseTensor& x, pten::DenseTensor* y, - std::vector origin_reduce_dims, + const TransformOp& transform, + const std::vector& origin_reduce_dims, gpuStream_t stream) { // Allocate memory y->mutable_data(); - auto x_dim = paddle::framework::vectorize(x.dims()); + + auto x_dim = paddle::framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); config.Run(); - int64_t numel = x.numel(); + int numel = x.numel(); // after config.run() // SetOutputData for ReduceHigherDim when should_reduce_again is true, // temp_output should be stored temp_data in output_data space or stored in // y_data; + pten::DDim tmp_ddim; pten::DenseTensor tmp = pten::DenseTensor( pten::make_intrusive(y->place()), @@ -814,61 +1093,24 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, auto* dev_ctx = static_cast( paddle::platform::DeviceContextPool::Instance().Get(x.place())); if (config.reduce_num == 1) { - auto out_dims = y->dims(); - if (x.dtype() == y->dtype()) { - AsyncCopy(x, y); - y->Resize(out_dims); - } else { - PD_VISIT_ALL_TYPES(y->dtype(), "CastKernelImpl", ([&] { - pten::math::CastKernelImpl( - *dev_ctx, x, y); - })); - } + std::vector inputs = {&x}; + std::vector outputs = {y}; + pten::LaunchSameDimsElementwiseCudaKernel( + *dev_ctx, inputs, &outputs, transform); return; } config.SetOutputData(y_data, x.place(), &tmp); - bool use_cub_reduce = (config.reduce_num == numel) && - (!std::is_same::value); + constexpr bool kIsTxFP16 = std::is_same::value; + bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16; if (use_cub_reduce) { - // launch CUB::Reduce - using TransformOp = typename ReduceOp::Transformer; - auto reducer = ReduceOp(); - cub::TransformInputIterator trans_x( - x_data, TransformOp(config.reduce_num)); - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, - temp_storage_bytes, - trans_x, - y_data, - config.reduce_num, - reducer, - reducer.initial(), - stream); - // framework::Tensor tmp; - pten::DenseTensor tmp = pten::DenseTensor( - pten::make_intrusive(x.place()), - pten::DenseTensorMeta(pten::DataType::UINT8, - paddle::framework::make_ddim( - {static_cast(temp_storage_bytes)}), - x.layout())); - auto* temp_storage = tmp.mutable_data(); - cub::DeviceReduce::Reduce(temp_storage, - temp_storage_bytes, - trans_x, - y_data, - config.reduce_num, - reducer, - reducer.initial(), - stream); - + CubTensorReduceFunctorImpl( + x_data, y_data, transform, config.reduce_num, x.place(), stream); return; } - using MPType = - typename paddle::operators::kernel_primitives::details::MPTypeTrait< - Ty>::Type; - auto reducer = ReduceOp(); + using MPType = typename 
kps::details::MPTypeTrait::Type; + auto reducer = ReduceOp(); // launch ReduceHigherDimKernel // when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this // function will be used @@ -877,7 +1119,6 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, // 32 // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 if (config.reduce_type == ReduceType::kReduceHigherDim) { - using TransformOp = typename ReduceOp::Transformer; kps::DimConfig dim = kps::DimConfig(config.grid.x, config.grid.y, config.grid.z, @@ -889,31 +1130,30 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, 0); #ifdef PADDLE_WITH_XPU2 - paddle::operators::ReduceHigherDimKernel, - TransformOp><<<8, 128, stream>>>( - x_data, - config.output_data, - reducer, - TransformOp(config.reduce_num), - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); + ReduceHigherDimKernel, + TransformOp><<<8, 128, stream>>>(x_data, + config.output_data, + reducer, + transform, + reducer.initial(), + config.reduce_num, + config.left_num, + config.blocking_size, + dim); #else - paddle::operators::ReduceHigherDimKernel< + ReduceHigherDimKernel< Tx, Ty, MPType, - ReduceOp, + ReduceOp, TransformOp><<>>( x_data, config.output_data, reducer, - TransformOp(config.reduce_num), + transform, reducer.initial(), config.reduce_num, config.left_num, @@ -929,11 +1169,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, dim2.SetRem(config.left_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU2 - paddle::operators::ReduceHigherDimKernel< + ReduceHigherDimKernel< Ty, Ty, MPType, - ReduceOp, + ReduceOp, kps::IdentityFunctor><<<8, 128, stream>>>( config.output_data, y_data, @@ -945,11 +1185,11 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, config.grid.y, dim2); #else - paddle::operators::ReduceHigherDimKernel< + ReduceHigherDimKernel< Ty, Ty, MPType, - ReduceOp, + ReduceOp, kps::IdentityFunctor><<>>( config.output_data, y_data, @@ -968,9 +1208,53 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this // function will be used - LaunchReduceKernel>( - x_data, y_data, reducer, reducer.initial(), stream, config); + LaunchReduceKernel, TransformOp>( + x_data, y_data, reducer, transform, reducer.initial(), stream, config); } -} // namespace detail +} // namespace kernels + +template class ReduceOp, + template class TransformOp> +void Reduce(const GPUContext& dev_ctx, + const DenseTensor& x, + bool reduce_all, + const std::vector& dims, + bool keep_dim, + DataType out_dtype, + DenseTensor* out) { + std::vector reduce_dims = + pten::kernels::details::GetReduceDim(dims, x.dims().size(), reduce_all); + + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (x.dims())[i]; + } + + gpuStream_t stream = dev_ctx.stream(); + + if (out_dtype != pten::DataType::UNDEFINED && out_dtype != x.dtype()) { + PD_DISPATCH_FLOATING_AND_COMPLEX_AND_2_TYPES( + pten::DataType::INT32, + pten::DataType::INT64, + out_dtype, + "TensorReduceFunctorImpl", + ([&] { + using MPType = typename kps::details::MPTypeTrait::Type; + pten::kernels::TensorReduceFunctorImpl>( + x, out, TransformOp(reduce_num), reduce_dims, stream); + })); + } else { + using MPType = typename kps::details::MPTypeTrait::Type; + pten::kernels:: + TensorReduceFunctorImpl>( + x, out, TransformOp(reduce_num), reduce_dims, stream); + } +} } // namespace pten + +#endif diff --git 
a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index e67fd4cfdccb3..f4bb5c5dbf755 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -16,11 +16,54 @@ limitations under the License. */ #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/impl/scale_kernel_impl.h" - // See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/float16.h" +namespace pten { + +template +struct ScaleFunctor { + InT bias; + InT scale; + bool bias_after_scale; + + ScaleFunctor(InT scale_data, InT bias_data, bool is_bias_after_sacle) { + scale = scale_data; + bias = bias_data; + bias_after_scale = is_bias_after_sacle; + } + + __device__ __forceinline__ InT operator()(const InT& x) const { + if (bias_after_scale) { + return scale * x + bias; + } else { + return scale * (x + bias); + } + } +}; + +template +void Scale(const ContextT& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out) { + std::vector inputs; + std::vector outputs; + inputs.emplace_back(&x); + outputs.emplace_back(out); + out->mutable_data(); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, + inputs, + &outputs, + ScaleFunctor(scale.to(), static_cast(bias), bias_after_scale)); +} + +} // namespace pten + PT_REGISTER_CTX_KERNEL(scale, GPU, ALL_LAYOUT, diff --git a/paddle/pten/kernels/gpu/utils.h b/paddle/pten/kernels/gpu/sign_kernel.cu similarity index 69% rename from paddle/pten/kernels/gpu/utils.h rename to paddle/pten/kernels/gpu/sign_kernel.cu index 3a455ad70c4dc..42b39141460fe 100644 --- a/paddle/pten/kernels/gpu/utils.h +++ b/paddle/pten/kernels/gpu/sign_kernel.cu @@ -12,22 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/pten/kernels/impl/sign_kernel_impl.h" +#include "paddle/pten/kernels/sign_kernel.h" #include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" -namespace pten { - -void Copy(const GPUContext& dev_ctx, - const DenseTensor& src, - bool blocking, - DenseTensor* dst); +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/platform/float16.h" -} // namespace pten +using float16 = paddle::platform::float16; -#endif +PT_REGISTER_CTX_KERNEL( + sign, GPU, ALL_LAYOUT, pten::Sign, float, double, float16) {} diff --git a/paddle/pten/kernels/hybird/CMakeLists.txt b/paddle/pten/kernels/hybird/CMakeLists.txt index 1304aa1798c0c..5d04bae2eae82 100644 --- a/paddle/pten/kernels/hybird/CMakeLists.txt +++ b/paddle/pten/kernels/hybird/CMakeLists.txt @@ -1,5 +1,4 @@ add_subdirectory(eigen) -add_subdirectory(blas) add_subdirectory(general) cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor pten_context) diff --git a/paddle/pten/kernels/hybird/blas/CMakeLists.txt b/paddle/pten/kernels/hybird/blas/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/paddle/pten/kernels/hybird/blas/elementwise.h b/paddle/pten/kernels/hybird/blas/elementwise.h deleted file mode 100644 index 1a530c9f8e940..0000000000000 --- a/paddle/pten/kernels/hybird/blas/elementwise.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/pten/core/dense_tensor.h" - -namespace pten { -namespace blas { - -template -void ElementwiseAdd(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VADD(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseSub(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VSUB(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseDiv(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VDIV(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseMul(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VMUL(x.numel(), x.data(), y.data(), out->mutable_data()); -} -} // namespace blas -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cpu/CMakeLists.txt b/paddle/pten/kernels/hybird/cpu/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/paddle/pten/kernels/hybird/cpu/elementwise.h b/paddle/pten/kernels/hybird/cpu/elementwise.h deleted file mode 100644 index d503957a76262..0000000000000 --- a/paddle/pten/kernels/hybird/cpu/elementwise.h +++ /dev/null @@ -1,230 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" - -namespace pten { - -inline void UpdateElementwiseIndexArray(const int *out_dims_array, - const int max_dim, - int *index_array) { - for (int i = max_dim - 1; i >= 0; --i) { - ++index_array[i]; - if (index_array[i] >= out_dims_array[i]) { - index_array[i] -= out_dims_array[i]; - } else { - break; - } - } -} - -inline int GetElementwiseIndex(const int *x_dims_array, - const int max_dim, - const int *index_array) { - int index_ = 0; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] > 1) { - index_ = index_ * x_dims_array[i] + index_array[i]; - } - } - return index_; -} - -template -void CommonForwardBroadcastCPU(const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z, - int *x_dims_array, - int *y_dims_array, - int *out_dims_array, - int max_dim, - const paddle::platform::CPUDeviceContext &ctx, - Functor func, - const bool is_xsize_larger = true) { - std::vector index_array(max_dim, 0); - const T *x_data = x.data(); - const T *y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL(x_data, - paddle::platform::errors::InvalidArgument( - "The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL(y_data, - paddle::platform::errors::InvalidArgument( - "The input Y should not be empty.")); - OutType *out_data = z->mutable_data(); - - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (is_xsize_larger) { - out_data[out_index] = func(x_data[x_index], y_data[y_index]); - } else { - out_data[out_index] = func(y_data[y_index], x_data[x_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonElementwiseBroadcastForward( - const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z, - const DDim &x_dims, - const DDim &y_dims, - Functor func, - int axis, - const bool is_xsize_larger = true) { - int max_dim = (std::max)(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - paddle::platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - paddle::platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - general::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - - CommonForwardBroadcastCPU(x, - y, - z, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - dev_ctx, - func, - is_xsize_larger); -} - -// It is a common CPU implementation to compute binary calculation with the -// support of broadcast. Note: -// 1. CPU implementation cannot support the case when x needs broadcast, thus -// this function need to be called with XxxFunctor and XxxInverseFunctor, -// like AddFunctor and InverseAddFunctor. -// 2. The corresponding GPU implementation supports all the broadcast cases, -// thus there is no need to define and call with XxxInverseFunctor. -// TODO(liuyiqun): optimize the CPU implementation to support all broadcast -// cases and avoid the need of XxxInverseFunctor. -template -void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - int axis, - Functor func, - DenseTensor *z) { - z->mutable_data(); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - general:: - TransformFunctor - functor(x, y, z, dev_ctx, func, is_xsize_larger); - if (x_dims == y_dims) { - functor.Run(); - return; - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - paddle::platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - paddle::platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = general::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - general::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = general::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - general::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common implementation. 
- // case 1: x=[2,3,1,5], y=[2,1,4,1] - // case 2: x=[2,3,4], y=[1,1,4] - if (is_run_common_broadcast == 1) { - CommonElementwiseBroadcastForward( - dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); - return; - } - - if (post == 1) { - functor.RunRowWise(n, pre); - return; - } else { - functor.RunMidWise(n, pre, post); - return; - } -} - -template -struct SameDimsElementwiseCompute { - void operator()(const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z) { - Functor()(dev_ctx, x, y, z); - } -}; - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h b/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h deleted file mode 100644 index d8c58448c9867..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/platform/device/gpu/gpu_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/core/dense_tensor.h" - -#include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -namespace pten { -namespace detail { - -template -__global__ void VecCastCUDAKernel(const InT* in, const int64_t N, OutT* out) { - using LoadT = paddle::platform::AlignedVector; - using StoreT = paddle::platform::AlignedVector; - - int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - for (int64_t i = idx * VecSize; i < N; - i += blockDim.x * gridDim.x * VecSize) { - LoadT in_val; - paddle::platform::Load(&in[i], &in_val); - - StoreT out_val; -#pragma unroll - for (int j = 0; j < VecSize; j++) { - out_val[j] = static_cast(in_val[j]); - } - - paddle::platform::Store(out_val, &out[i]); - } -} - -template -__global__ void CastCUDAKernel(const InT* in, const int64_t N, OutT* out) { - CUDA_KERNEL_LOOP(index, N) { out[index] = static_cast(in[index]); } -} - -template -void CastCUDAKernelImpl(const GPUContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out) { - auto* in_data = x.data(); - auto size = x.numel(); - auto* out_data = out->mutable_data(); - - paddle::platform::GpuLaunchConfig config = - paddle::platform::GetGpuLaunchConfig1D(dev_ctx, size); - int vec_size = paddle::platform::GetVectorizedSize(out_data); - if (!std::is_same::value && vec_size == 4 && size % 4 == 0) { - VecCastCUDAKernel<<>>( - in_data, size, out_data); - } else { - CastCUDAKernel<<>>(in_data, size, out_data); - } -} - -} // namespace detail - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h deleted file mode 100644 index 0ef2ee2fdf1f4..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h" -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h" - -namespace pten { - -template -void LaunchElementwiseCudaKernel( - const paddle::platform::CUDADeviceContext &cuda_ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - std::vector dims_size; - bool no_broadcast_flag = true; - for (auto *in : ins) { - no_broadcast_flag &= ins[0]->dims() == in->dims(); - dims_size.emplace_back(in->dims().size()); - } - if (no_broadcast_flag) { - LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, outs, func); - } else { - axis = axis == -1 - ? *std::max_element(dims_size.begin(), dims_size.end()) - - *std::min_element(dims_size.begin(), dims_size.end()) - : axis; - LaunchBroadcastElementwiseCudaKernel( - cuda_ctx, ins, outs, axis, func); - } -} - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h deleted file mode 100644 index ccdeb70002bfc..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h +++ /dev/null @@ -1,477 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h" - -namespace pten { - -struct DimensionsTransform { - using DimVector = std::vector; - typedef void (*MergeFunctor)( - bool &, std::vector &, DimVector &, int, int); - int64_t dim_size; - DimVector out_dims; - std::vector in_dims; - - private: - // To compensate the lackage of input_tensors` dimension with input variable - // 'axis' - void InputDimensionsExtend(int N, int axis) { - for (auto &in_dim : in_dims) { - int64_t in_idx = 0; - if (in_dim.size() < dim_size) { - DimVector tmp_dim(dim_size, 1); - do { - if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { - tmp_dim[axis] = in_dim[in_idx]; - in_idx++; - axis++; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", - in_idx + 1, - axis + 1, - out_dims[axis], - in_dim[in_idx])); - } - } while (in_idx < in_dim.size()); - in_dim.resize(dim_size); - std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); - } else { - do { - if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { - in_idx++; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", - in_idx + 1, - in_idx + 1, - out_dims[in_idx], - in_dim[in_idx])); - } - } while (in_idx < dim_size); - } - std::reverse(in_dim.begin(), in_dim.end()); - } - std::reverse(out_dims.begin(), out_dims.end()); - } - - template - __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { - auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { - (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx, - vec->begin() + m_idx, - 1, - std::multiplies()); - vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); - }; - - int64_t i = 0; - while (i < dim_size) { - int cnt = 0; - int low_idx = i; - bool equal = true; - do { - merge_func(equal, in_dims, out_dims, i, N); - if (equal) { - i++; - cnt++; - } else { - break; - } - } while (i < dim_size); - - if (cnt > 1) { - for (auto &in_dim : in_dims) { - VectorReorganise(&in_dim, low_idx, i); - } - VectorReorganise(&out_dims, low_idx, i); - dim_size -= --cnt; - i -= cnt; - } else if (cnt < 1) { - i++; - } - } - } - - public: - explicit DimensionsTransform(const std::vector &ins, - const paddle::framework::DDim &dims, - int axis) { - const int N = ins.size(); - dim_size = dims.size(); - out_dims = paddle::framework::vectorize(dims); - in_dims.resize(N); - for (int j = 0; j < N; ++j) { - in_dims[j] = paddle::framework::vectorize(ins[j]->dims()); - } - InputDimensionsExtend(N, axis); - - auto merge_sequential_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, - int i, - int num) { - for (int j = 1; j < num; ++j) { - equal &= (in_dims[0][i] == in_dims[j][i]) ? true : false; - } - }; - auto merge_sequential_one_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, - int i, - int num) { - equal = in_dims[0][i] == 1; - if (equal) { - for (int j = 1; j < num; ++j) { - equal &= in_dims[j][i] == out[i]; - } - } - }; - // To Merge the dimensions of input_tensors while the consequtive - // equal-dimensions appears. 
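// (illustrative) e.g. two inputs shaped [2, 3, 4, 5] and [2, 3, 4, 1]:
// the run of dims every input agrees on (2 x 3 x 4) collapses into a single
// dim of 24, so broadcasting only has to track [24, 5] versus [24, 1].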
- MergeFunctor merge_ptr = merge_sequential_dims; - MergeDimensions(merge_ptr, N); - - int min_idx = 0; - int min_val = std::accumulate( - in_dims[0].begin(), in_dims[0].end(), 1, std::multiplies()); - for (int j = 1; j < N; ++j) { - int temp = std::accumulate( - in_dims[j].begin(), in_dims[j].end(), 1, std::multiplies()); - min_val = min_val > temp ? temp : min_val; - min_idx = min_val == temp ? j : min_idx; - } - std::swap(in_dims[0], in_dims[min_idx]); - - // To Merge the dimension of input_tensors while the consequtive - // 1-value-dimensions appears. - merge_ptr = merge_sequential_one_dims; - MergeDimensions(merge_ptr, N); - std::swap(in_dims[min_idx], in_dims[0]); - } -}; - -template -__device__ __forceinline__ void LoadData( - T *dst, - const T *__restrict__ src, - uint32_t block_offset, - const kps::details::BroadcastConfig &config, - int numel, - int num, - bool need_broadcast) { - // numel : whole num of output - // num: how many data will be deal with in this time - if (need_broadcast) { - kps::ReadDataBc( - dst, src, block_offset, config, numel); - } else { - kps::ReadData(dst, src + block_offset, num); - } -} - -template -__device__ void ElementwiseBroadcastKernelImpl( - const paddle::framework::Array &ins, - OutT *out, - const paddle::framework::Array &use_broadcast, - uint32_t numel, - const paddle::framework::Array, Arity> - &configs, - int num, - int block_offset, - Functor func) { - InT args[Arity][VecSize]; - OutT result[VecSize]; - -#pragma unroll - for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); - LoadData(args[i], - ins[i], - block_offset, - configs[i], - numel, - num, - use_broadcast[i]); - } - - const bool kCallElementwiseAny = - paddle::platform::FunctionTraits::has_pointer_args; - ElementwisePrimitiveCaller()(func, args, result); - kps::WriteData( - out + block_offset, result, num); -} - -template -__global__ void ElementwiseBroadcastKernel( - paddle::framework::Array ins, - OutT *out, - paddle::framework::Array use_broadcast, - uint32_t numel, - paddle::framework::Array, Arity> - configs, - int main_offset, - int tail_tid, - Functor func) { - int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; -#ifdef PADDLE_WITH_XPU2 - for (; block_offset < main_offset; block_offset += stride) { - ElementwiseBroadcastKernelImpl(ins, - out, - use_broadcast, - numel, - configs, - BLOCK_NUM_X * VecSize, - block_offset, - func); - } - if (block_offset < numel) { - ElementwiseBroadcastKernelImpl( - ins, out, use_broadcast, numel, configs, tail_tid, block_offset, func); - } - -#else - if (block_offset < main_offset) { - ElementwiseBroadcastKernelImpl(ins, - out, - use_broadcast, - numel, - configs, - BLOCK_NUM_X * VecSize, - block_offset, - func); - } else { - ElementwiseBroadcastKernelImpl( - ins, out, use_broadcast, numel, configs, tail_tid, block_offset, func); - } -#endif -} - -template -void LaunchKernel(const paddle::platform::CUDADeviceContext &ctx, - const std::vector &ins, - DenseTensor *out, - Functor func, - DimensionsTransform merge_dims) { - int numel = out->numel(); - const int threads = 256; - int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; - - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); - auto stream = ctx.stream(); - OutT *out_data = out->mutable_data(); - - paddle::framework::Array, Arity> configs; - paddle::framework::Array use_broadcast; - paddle::framework::Array ins_data; - - for (int i = 
0; i < Arity; i++) { - use_broadcast[i] = (ins[i]->numel() != numel); - ins_data[i] = ins[i]->data(); - if (use_broadcast[i]) { - // get the broadcast config, - // if data shape is[m, n], then you should set data_dim = {n, m} - // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} - configs[i] = kps::details::BroadcastConfig( - merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); - } - } -#ifdef PADDLE_WITH_XPU2 - threads = 128; - blocks = 8; - main_offset = (numel / (VecSize * threads)) * VecSize * threads; - tail_tid = numel % (VecSize * threads); - ElementwiseBroadcastKernel<<>>(ins_data, - out_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); -#else - ElementwiseBroadcastKernel<<>>( - ins_data, - out_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); -#endif -} - -template -void LaunchBroadcastKernelForDifferentVecSize( - const paddle::platform::CUDADeviceContext &ctx, - const std::vector &ins, - DenseTensor *out, - int axis, - Functor func) { - const auto merge_dims = DimensionsTransform(ins, out->dims(), axis); - -#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ - case rank: { \ - LaunchKernel( \ - ctx, ins, out, func, merge_dims); \ - } break; - - switch (merge_dims.dim_size) { - CALL_BROADCAST_FOR_DIM_SIZE(1); - CALL_BROADCAST_FOR_DIM_SIZE(2); - CALL_BROADCAST_FOR_DIM_SIZE(3); - CALL_BROADCAST_FOR_DIM_SIZE(4); - CALL_BROADCAST_FOR_DIM_SIZE(5); - CALL_BROADCAST_FOR_DIM_SIZE(6); - CALL_BROADCAST_FOR_DIM_SIZE(7); - CALL_BROADCAST_FOR_DIM_SIZE(8); - default: { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The maximum dimension of input tensor is expected to be less than " - "%d, but recieved %d.\n", - merge_dims.dim_size, - paddle::framework::DDim::kMaxRank)); - } - } -#undef CALL_BROADCAST_FOR_DIM_SIZE -} - -template -void LaunchBroadcastElementwiseCudaKernel( - const paddle::platform::CUDADeviceContext &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - using Traits = paddle::platform::FunctionTraits; - const int kArity = - Traits::has_pointer_args ? static_cast(ET) : Traits::arity; - PADDLE_ENFORCE_EQ(ins.size(), - kArity, - paddle::platform::errors::InvalidArgument( - "The number of inputs is expected to be equal to the " - "arity of functor. But recieved: the number of inputs " - "is %d, the arity of functor is %d.", - ins.size(), - kArity)); - PADDLE_ENFORCE_EQ(kArity, - 2, - paddle::platform::errors::InvalidArgument( - "Currently only broadcast of binary is supported and " - "verified, but received %d.", - kArity)); - - int in_vec_size = 4; - DenseTensor *out = (*outs)[0]; - for (auto *in : ins) { - auto temp_size = paddle::platform::GetVectorizedSize(in->data()); - in_vec_size = in->dims() == out->dims() ? 
std::min(temp_size, in_vec_size) - : in_vec_size; - } - int out_vec_size = - paddle::platform::GetVectorizedSize(out->data()); - int vec_size = std::min(out_vec_size, in_vec_size); - - switch (vec_size) { - case 4: { - LaunchBroadcastKernelForDifferentVecSize( - ctx, ins, out, axis, func); - break; - } - case 2: { - LaunchBroadcastKernelForDifferentVecSize( - ctx, ins, out, axis, func); - break; - } - case 1: { - LaunchBroadcastKernelForDifferentVecSize( - ctx, ins, out, axis, func); - break; - } - default: { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported vectorized size: %d !", vec_size)); - break; - } - } -} - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h deleted file mode 100644 index 053b53041d165..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/function_traits.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" - -namespace pten { -namespace kps = paddle::operators::kernel_primitives; -enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 }; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result); -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseAny( - result, args, func); - } -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseUnary( - result, args[0], func); - } -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseBinary( - result, args[0], args[1], func); - } -}; - -template -struct ElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, - InT (*args)[VecSize], - OutT *result) { - kps::ElementwiseTernary( - result, args[0], args[1], args[2], func); - } -}; - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h deleted file mode 100644 index e2659271bdcd9..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h" - -#ifdef __HIPCC__ -#define ELEMENTWISE_BLOCK_SIZE 256 -#else -#define ELEMENTWISE_BLOCK_SIZE 512 -#endif - -namespace pten { - -/* -* According to NVIDIA, if number of threads per block is 64/128/256/512, -* cuda performs better. And number of blocks should be greater (at least -* 2x~4x) than number of SMs. Hence, SM count is took into account within -* this function to determine the right number of threads per block. -*/ -inline int GetThreadsConfig(const paddle::platform::CUDADeviceContext &ctx, - int64_t numel, - int vec_size) { - int threads = ELEMENTWISE_BLOCK_SIZE; - int sm_count = ctx.GetSMCount(); - int active_threads_num = numel / vec_size; - if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about twice of SM, to acquire better performance. - threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / - (sm_count << 1)); - } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) { - // Round up threads number into an exponential multiple of 2, while number - // of acitve blocks is about 4 times of SM, to acquire better performance. - threads = paddle::platform::RoundToPowerOfTwo(active_threads_num / - (sm_count << 2)); - } - // Number of threads per block shall be larger than 64. 
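For readers skimming the deleted GetThreadsConfig above: it shrinks the block size for small tensors so that roughly 2x~4x as many blocks as SMs stay active, never dropping below 64 threads per block. A self-contained sketch of that heuristic follows; RoundUpToPowerOfTwo is a hypothetical stand-in for the Paddle helper and 512 mirrors the CUDA ELEMENTWISE_BLOCK_SIZE above.

#include <algorithm>
#include <cstdint>

static int RoundUpToPowerOfTwo(int n) {  // hypothetical stand-in
  int p = 1;
  while (p < n) p <<= 1;
  return p;
}

static int ChooseBlockSize(std::int64_t numel, int vec_size, int sm_count) {
  const int kMaxBlock = 512;  // ELEMENTWISE_BLOCK_SIZE on CUDA
  int active_threads = static_cast<int>(numel / vec_size);
  int threads = kMaxBlock;
  if (active_threads / (sm_count << 1) < kMaxBlock) {
    // keep roughly 2x as many blocks as SMs busy
    threads = RoundUpToPowerOfTwo(active_threads / (sm_count << 1));
  } else if (active_threads / (sm_count << 2) < kMaxBlock) {
    // otherwise aim for roughly 4x as many blocks as SMs
    threads = RoundUpToPowerOfTwo(active_threads / (sm_count << 2));
  }
  return std::max(64, threads);  // never fewer than 64 threads per block
}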
- return std::max(64, threads); -} - -template -__device__ void VectorizedElementwiseKernelImpl( - const paddle::framework::Array &in, - OutT *out, - int num, - int data_offset, - Functor func) { - InT args[Arity][VecSize]; - OutT result[VecSize]; - -#pragma unroll - for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); - kps::ReadData( - args[i], in[i] + data_offset, num); - } - - const bool kCallElementwiseAny = - paddle::platform::FunctionTraits::has_pointer_args; - ElementwisePrimitiveCaller()(func, args, result); - kps::WriteData( - out + data_offset, result, num); -} - -template -__global__ void VectorizedElementwiseKernel( - paddle::framework::Array ins, - OutT *out, - int size, - int main_offset, - Functor func) { - int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; - for (; data_offset < main_offset; data_offset += stride) { - VectorizedElementwiseKernelImpl( - ins, out, VecSize * BLOCK_NUM_X, data_offset, func); - } - - int num = size - data_offset; - if (num > 0) { - VectorizedElementwiseKernelImpl( - ins, out, num, data_offset, func); - } -} - -template -int GetVectorizedSizeForTensors(const std::vector &ins, - const std::vector &outs) { - int vec_size = 4; - for (auto iter = ins.begin(); iter != ins.end(); ++iter) { - vec_size = std::min( - vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); - } - for (auto iter = outs.begin(); iter != outs.end(); ++iter) { - vec_size = std::min( - vec_size, paddle::platform::GetVectorizedSize((*iter)->data())); - } - return vec_size; -} - -template -void ElementwiseCudaKernel(const paddle::platform::CUDADeviceContext &ctx, - const std::vector &ins, - std::vector *outs, - Functor func) { - auto numel = ins[0]->numel(); - int block_size = GetThreadsConfig(ctx, numel, VecSize); - int grid_size = - ((numel + VecSize - 1) / VecSize + block_size - 1) / block_size; - auto stream = ctx.stream(); - OutT *out_data = (*outs)[0]->mutable_data(); - paddle::framework::Array ins_data; - for (int i = 0; i < Arity; i++) { - ins_data[i] = ins[i]->data(); - } -#ifdef PADDLE_WITH_XPU2 - block_size = 128; - grid_size = 8; - int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; - VectorizedElementwiseKernel<<>>( - ins_data, out_data, numel, main_offset, func); -#else - int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; - VectorizedElementwiseKernel<<>>( - ins_data, out_data, numel, main_offset, func); -#endif -} - -template -void LaunchSameDimsElementwiseCudaKernel( - const paddle::platform::CUDADeviceContext &ctx, - const std::vector &ins, - std::vector *outs, - Functor func) { - using Traits = paddle::platform::FunctionTraits; - const int kArity = - Traits::has_pointer_args ? static_cast(ET) : Traits::arity; - PADDLE_ENFORCE_EQ(ins.size(), - kArity, - paddle::platform::errors::InvalidArgument( - "The number of inputs is expected to be equal to the " - "arity of functor. 
But recieved: the number of inputs " - "is %d, the arity of functor is %d.", - ins.size(), - kArity)); - // calculate the max vec_size for all ins and outs - int vec_size = GetVectorizedSizeForTensors(ins, *outs); - switch (vec_size) { - case 4: - ElementwiseCudaKernel( - ctx, ins, outs, func); - break; - case 2: - ElementwiseCudaKernel( - ctx, ins, outs, func); - break; - case 1: - ElementwiseCudaKernel( - ctx, ins, outs, func); - break; - default: { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported vectorized size: %d !", vec_size)); - break; - } - } -} - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce.h deleted file mode 100644 index 793e8505ec606..0000000000000 --- a/paddle/pten/kernels/hybird/cuda/reduce/reduce.h +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -#include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h" - -namespace pten { - -static inline std::vector GetReduceDim( - const std::vector& dims, int dim_size, bool reduce_all) { - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(dim_size); - int reduce_size = reduce_dims.size(); - for (int i = 0; i < reduce_size; ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - PADDLE_ENFORCE_LT(e, - dim_size, - paddle::platform::errors::InvalidArgument( - "ReduceOp: invalid axis, when x_dims is %d, " - "axis[i] should less than x_dims, but got %d.", - dim_size, - e)); - reduce_dims.push_back(e >= 0 ? e : e + dim_size); - } - } - return reduce_dims; -} - -template class ReduceFunctor> -void Reduce(const GPUContext& dev_ctx, - const DenseTensor& x, - bool reduce_all, - const std::vector& dims, - bool keep_dim, - DataType out_dtype, - DenseTensor* out) { - std::vector reduce_dims = - GetReduceDim(dims, x.dims().size(), reduce_all); - - gpuStream_t stream = dev_ctx.stream(); - - if (out_dtype != pten::DataType::UNDEFINED && out_dtype != x.dtype()) { - PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES( - out_dtype, "TensorReduceFunctorImpl", ([&] { - pten::detail::TensorReduceFunctorImpl( - x, out, reduce_dims, stream); - })); - } else { - pten::detail::TensorReduceFunctorImpl( - x, out, reduce_dims, stream); - } -} - -} // namespace pten - -#endif diff --git a/paddle/pten/kernels/hybird/eigen/dot.h b/paddle/pten/kernels/hybird/eigen/dot.h deleted file mode 100644 index eb089037fa3f3..0000000000000 --- a/paddle/pten/kernels/hybird/eigen/dot.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
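The deleted GetReduceDim above validates each axis against the tensor rank and wraps negative axes; a tiny standalone illustration of that wrapping rule (the helper name here is made up):

#include <cassert>
#include <cstdint>
#include <vector>

// Negative reduce axes count from the back: axis d becomes d + rank.
static std::vector<std::int64_t> NormalizeAxes(
    const std::vector<std::int64_t>& dims, int rank) {
  std::vector<std::int64_t> out;
  for (auto d : dims) out.push_back(d >= 0 ? d : d + rank);
  return out;
}

int main() {
  // For a rank-3 tensor, reducing over {-1, 0} touches axes 2 and 0.
  assert(NormalizeAxes({-1, 0}, 3) == (std::vector<std::int64_t>{2, 0}));
  return 0;
}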
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace pten { -namespace eigen { - -template -void Dot(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - out->mutable_data(); - if (1 == out->dims().size()) { - auto eigen_out = pten::EigenScalar::From(*out); - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - - auto& dev = *dev_ctx.eigen_device(); - eigen_out.device(dev) = (eigen_x * eigen_y).sum(); - } else { - auto eigen_out = pten::EigenMatrix::From(*out); - auto eigen_x = pten::EigenMatrix::From(x); - auto eigen_y = pten::EigenMatrix::From(y); - - auto& dev = *dev_ctx.eigen_device(); - eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes(1)); - } -} - -} // namespace eigen -} // namespace pten diff --git a/paddle/pten/kernels/hybird/eigen/elementwise.h b/paddle/pten/kernels/hybird/eigen/elementwise.h deleted file mode 100644 index e67cce63d461f..0000000000000 --- a/paddle/pten/kernels/hybird/eigen/elementwise.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
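To make the deleted eigen::Dot above concrete: for 2-D inputs it contracts along the last axis, producing one value per row. A plain-C++ illustration of that semantics (no Eigen; the shapes and values are made up):

#include <cassert>

int main() {
  // x and y are 2 x 3; Dot reduces over the last axis, so out has 2 entries.
  const float x[2][3] = {{1, 2, 3}, {4, 5, 6}};
  const float y[2][3] = {{1, 1, 1}, {2, 2, 2}};
  float out[2] = {0, 0};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 3; ++j) out[i] += x[i][j] * y[i][j];
  assert(out[0] == 6.0f && out[1] == 30.0f);
  return 0;
}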
*/ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" - -namespace pten { -namespace eigen { - -template -void ElementwiseAdd(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - out->mutable_data(); - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x + eigen_y; -} - -template -void ElementwiseSub(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x - eigen_y; -} - -template -void ElementwiseMul(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x * eigen_y; -} - -} // namespace eigen -} // namespace pten diff --git a/paddle/pten/kernels/hybird/general/elementwise_functor.h b/paddle/pten/kernels/hybird/general/elementwise_functor.h deleted file mode 100644 index 62b422f4ae414..0000000000000 --- a/paddle/pten/kernels/hybird/general/elementwise_functor.h +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/hostdevice.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/blas/elementwise.h" -#include "paddle/pten/kernels/hybird/eigen/elementwise.h" - -namespace pten { -namespace general { - -// Define the binary functors used in elementwise ops. 
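The SameDims*Functor definitions that follow use std::enable_if to send floating-point element types to the BLAS path and everything else to Eigen. Below is a minimal standalone sketch of that dispatch pattern; the backend functions are fakes, the real condition is of the std::is_floating_point flavour, and only the template mechanics are meant to match.

#include <iostream>
#include <type_traits>

// Fake backends standing in for blas::ElementwiseAdd / eigen::ElementwiseAdd.
template <typename T>
void BlasAdd(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
}
template <typename T>
void EigenAdd(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
}

// Primary template is only declared; SFINAE picks one partial specialization.
template <typename T, typename Enable = void>
struct SameDimsAddSketch;

template <typename T>
struct SameDimsAddSketch<
    T, typename std::enable_if<std::is_floating_point<T>::value>::type> {
  void operator()(const T* x, const T* y, T* z, int n) { BlasAdd(x, y, z, n); }
};

template <typename T>
struct SameDimsAddSketch<
    T, typename std::enable_if<!std::is_floating_point<T>::value>::type> {
  void operator()(const T* x, const T* y, T* z, int n) { EigenAdd(x, y, z, n); }
};

int main() {
  float xf[2] = {1, 2}, yf[2] = {3, 4}, zf[2];
  int xi[2] = {1, 2}, yi[2] = {3, 4}, zi[2];
  SameDimsAddSketch<float>()(xf, yf, zf, 2);   // floating point -> "BLAS"
  SameDimsAddSketch<int>()(xi, yi, zi, 2);     // integral       -> "Eigen"
  std::cout << zf[0] << " " << zi[1] << "\n";  // prints: 4 6
  return 0;
}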
- -// Add -template -struct SameDimsAddFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseAdd(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseAdd(dev_ctx, x, y, z); - } -}; - -template -struct AddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } -}; -template -struct InverseAddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } -}; - -// Subtract -template -struct SameDimsSubtractFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseSub(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseSub(dev_ctx, x, y, z); - } -}; - -template -struct SubtractFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } -}; -template -struct InverseSubtractFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } -}; - -// Divide -template -struct SameDimsDivideFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - paddle::platform::errors::InvalidArgument( - "If use SameDimsDivideFunctor, template args(T) must be floating " - "point. "); - } -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseDiv(dev_ctx, x, y, z); - } -}; - -#define DIV_ERROR_INFO \ - "InvalidArgumentError: Integer division by zero encountered in " \ - "(floor) divide. Please check the input value." - -template -struct DivideFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } -}; - -template -struct DivideFunctor< - T, - typename std::enable_if::value>::type> { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { - // For int32/int64, need to check whether the divison is zero. 
- PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); - return a / b; - } -}; - -template -struct InverseDivideFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } -}; - -// Multiply -template -struct SameDimsMultiplyFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsMultiplyFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseMul(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsMultiplyFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseMul(dev_ctx, x, y, z); - } -}; -template -struct MultiplyFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } -}; -template -struct InverseMultiplyFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } -}; - -} // namespace general -} // namespace pten diff --git a/paddle/pten/kernels/hybird/general/reduce_impl.h b/paddle/pten/kernels/hybird/general/reduce_impl.h deleted file mode 100644 index 52bdf18ad5a31..0000000000000 --- a/paddle/pten/kernels/hybird/general/reduce_impl.h +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include "paddle/fluid/platform/transform.h" -#include "paddle/pten/api/ext/dispatch.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/reduce.h" -#include "paddle/pten/kernels/hybird/math/cast_func.h" -namespace pten { -namespace general { - -template -void Reduce(const DeviceContext& dev_ctx, - const DenseTensor& x, - bool reduce_all, - const std::vector& dims, - bool keep_dim, - DataType out_dtype, - DenseTensor* out) { - // If the dims has full dim, set the reduce_all is True - const auto& input_dim_size = x.dims().size(); - std::set dims_set(dims.begin(), dims.end()); - bool full_dim = true; - for (auto i = 0; i < input_dim_size; ++i) { - if (dims_set.find(i) == dims_set.end()) { - full_dim = false; - break; - } - } - reduce_all = (reduce_all || full_dim); - - // no need to cast dtype - if (out_dtype == pten::DataType::UNDEFINED || out_dtype == x.dtype()) { - if (out_dtype == pten::DataType::UNDEFINED) { - out_dtype = x.dtype(); - } - // do reduce sum - PD_VISIT_ALL_TYPES( - out_dtype, "ReduceKernelImpl", ([&] { - pten::eigen::ReduceKernelImpl( - dev_ctx, x, out, dims, keep_dim, reduce_all); - })); - } else { - pten::DenseTensor tmp_tensor = pten::DenseTensor( - pten::make_intrusive(x.place()), - pten::DenseTensorMeta(out_dtype, x.dims(), x.layout())); - - // cast x tensor to out_dtype first - PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] { - math::CastKernelImpl( - dev_ctx, x, &tmp_tensor); - })); - - // do reduce sum - PD_VISIT_ALL_TYPES( - out_dtype, "ReduceKernelImpl", ([&] { - pten::eigen::ReduceKernelImpl( - dev_ctx, tmp_tensor, out, dims, keep_dim, reduce_all); - })); - } -} - -} // namespace general - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/math/cast_func.h b/paddle/pten/kernels/hybird/math/cast_func.h deleted file mode 100644 index 0a67736dbb27b..0000000000000 --- a/paddle/pten/kernels/hybird/math/cast_func.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
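The deleted general::Reduce above first checks whether the requested axes cover every dimension and, if so, upgrades the call to reduce_all; a standalone sketch of just that check (the helper name is made up):

#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

// True when the axes to reduce touch every dimension of a rank-`rank` tensor,
// i.e. the reduction is effectively a full reduction.
static bool CoversAllAxes(const std::vector<std::int64_t>& dims, int rank) {
  std::set<std::int64_t> dims_set(dims.begin(), dims.end());
  for (int i = 0; i < rank; ++i) {
    if (dims_set.count(i) == 0) return false;
  }
  return true;
}

int main() {
  assert(CoversAllAxes({0, 1, 2}, 3));   // treated as reduce_all
  assert(!CoversAllAxes({0, 2}, 3));     // axis 1 is kept
  return 0;
}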
- -#pragma once - -#include "paddle/fluid/platform/transform.h" -#include "paddle/pten/core/dense_tensor.h" - -namespace pten { -namespace math { - -template -struct CastOpTransformFunctor { - HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } -}; - -template -void CastKernelImpl(const DeviceContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out) { - auto* in_begin = x.data(); - auto numel = x.numel(); - auto* in_end = in_begin + numel; - - auto* out_begin = out->mutable_data(); - - paddle::platform::Transform trans; - trans(dev_ctx, - in_begin, - in_end, - out_begin, - CastOpTransformFunctor()); -} - -} // namespace math - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/transpose.cu b/paddle/pten/kernels/hybird/transpose.cu index 195277c216fe9..6ea5e36e106d5 100644 --- a/paddle/pten/kernels/hybird/transpose.cu +++ b/paddle/pten/kernels/hybird/transpose.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/memory/memcpy.h" #include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/math/cast_func.h" #include "paddle/pten/kernels/hybird/transpose.h" // See Note [ Why still include the fluid headers? ] diff --git a/paddle/pten/kernels/hybird/math/conj_impl.h b/paddle/pten/kernels/impl/complex_kernel_impl.h similarity index 80% rename from paddle/pten/kernels/hybird/math/conj_impl.h rename to paddle/pten/kernels/impl/complex_kernel_impl.h index 84ad0b1a6ce95..6f3a6049faa9a 100644 --- a/paddle/pten/kernels/hybird/math/conj_impl.h +++ b/paddle/pten/kernels/impl/complex_kernel_impl.h @@ -14,19 +14,19 @@ #pragma once +// See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/math/complex_functors.h" -#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" namespace pten { -template -void ConjImpl(const ContextT& dev_ctx, const DenseTensor& x, DenseTensor* out) { +template +void Conj(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); auto* out_data = out->mutable_data(); - paddle::platform::ForRange for_range(dev_ctx, numel); + paddle::platform::ForRange for_range(dev_ctx, numel); paddle::operators::math::ConjFunctor functor(x_data, numel, out_data); for_range(functor); } diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index 7076bb51b3621..9be40e22a0360 100644 --- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -24,24 +24,26 @@ limitations under the License. 
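The deleted CastKernelImpl above is, in essence, a typed transform over the flattened buffer; a minimal host-only sketch of the same idea (plain std::transform instead of paddle::platform::Transform, functor name made up):

#include <algorithm>
#include <cassert>
#include <vector>

// Mirror of CastOpTransformFunctor: element-wise static_cast.
template <typename InT, typename OutT>
struct CastSketchFunctor {
  OutT operator()(InT in) const { return static_cast<OutT>(in); }
};

int main() {
  std::vector<float> in{1.5f, 2.5f, -3.5f};
  std::vector<int> out(in.size());
  std::transform(in.begin(), in.end(), out.begin(),
                 CastSketchFunctor<float, int>());
  assert(out[0] == 1 && out[1] == 2 && out[2] == -3);  // truncation toward zero
  return 0;
}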
*/ namespace pten { -template -void fill_(const DeviceContext& context, DenseTensor* tensor, VType val) { +template +void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) { tensor->mutable_data(); auto t = pten::EigenVector::Flatten(*tensor); - t.device(*context.eigen_device()) = t.constant(static_cast(val)); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(val)); } -template -void Full(const ContextT& dev_ctx, - const ScalarArray& shape, - const Scalar& val, - DenseTensor* out) { +template +void FullKernel(const Context& dev_ctx, + const ScalarArray& shape, + const Scalar& val, + DenseTensor* out) { out->Resize(paddle::framework::make_ddim(shape.GetData())); - fill_(dev_ctx, out, val.to()); + FullValue(dev_ctx, out, val.to()); } -template -void FullLike(const ContextT& dev_ctx, const Scalar& val, DenseTensor* out) { +template +void FullLikeKernel(const Context& dev_ctx, + const Scalar& val, + DenseTensor* out) { auto value = val.to(); using CommonType = typename std::common_type< float, @@ -66,7 +68,7 @@ void FullLike(const ContextT& dev_ctx, const Scalar& val, DenseTensor* out) { static_cast(std::numeric_limits::lowest()), static_cast(std::numeric_limits::max()), static_cast(value))); - fill_(dev_ctx, out, value); + FullValue(dev_ctx, out, value); } } // namespace pten diff --git a/paddle/pten/kernels/hybird/math/matmul_func.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h similarity index 93% rename from paddle/pten/kernels/hybird/math/matmul_func.h rename to paddle/pten/kernels/impl/matmul_kernel_impl.h index 8aa8750aba418..e50b2f0641a46 100644 --- a/paddle/pten/kernels/hybird/math/matmul_func.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" namespace pten { -namespace math { static void GetBroadcastFromDims(const int x_ndim, const std::int64_t* x_dims, @@ -86,8 +85,8 @@ static void IndexIncreaseFromDims(const int ndim, } } -template -void MatMulFunction(const DeviceContext& dev_ctx, +template +void MatMulFunction(const Context& context, const DenseTensor& X, const DenseTensor& Y, const std::vector& x_dims, @@ -103,7 +102,7 @@ void MatMulFunction(const DeviceContext& dev_ctx, const T* x_data = X.data(); const T* y_data = Y.data(); - auto blas = paddle::operators::math::GetBlas(dev_ctx); + auto blas = paddle::operators::math::GetBlas(context); if (x_ndim == 1 && y_ndim == 1) { const int M = X.numel(); @@ -471,8 +470,8 @@ void MatMulFunction(const DeviceContext& dev_ctx, } } -template -void MatMulFunction(const DeviceContext& dev_ctx, +template +void MatMulFunction(const Context& context, const DenseTensor& X, const DenseTensor& Y, DenseTensor* Out, @@ -481,9 +480,28 @@ void MatMulFunction(const DeviceContext& dev_ctx, bool flag = false) { const std::vector x_dims = vectorize(X.dims()); const std::vector y_dims = vectorize(Y.dims()); - MatMulFunction( - dev_ctx, X, Y, x_dims, y_dims, Out, trans_x, trans_y, flag); + MatMulFunction( + context, X, Y, x_dims, y_dims, Out, trans_x, trans_y, flag); +} + +template +void MatmulKernel(const Context& context, + const DenseTensor& x, + const DenseTensor& y, + bool transpose_x, + bool transpose_y, + DenseTensor* out) { + PADDLE_ENFORCE_NE(paddle::framework::product(x.dims()), + 0, + paddle::platform::errors::InvalidArgument( + "The Input(X) dims size must not be equal 0," + " but reviced dims size is 0. 
")); + PADDLE_ENFORCE_NE(paddle::framework::product(y.dims()), + 0, + paddle::platform::errors::InvalidArgument( + "The Input(Y) dims size must not be equal 0," + " but reviced dims size is 0. ")); + MatMulFunction(context, x, y, out, transpose_x, transpose_y); } -} // namespace math } // namespace pten diff --git a/paddle/pten/kernels/impl/scale_kernel_impl.h b/paddle/pten/kernels/impl/scale_kernel_impl.h index 421bb9f7b0042..937b3115e63b3 100644 --- a/paddle/pten/kernels/impl/scale_kernel_impl.h +++ b/paddle/pten/kernels/impl/scale_kernel_impl.h @@ -23,8 +23,8 @@ limitations under the License. */ namespace pten { -template -void Scale(const ContextT& dev_ctx, +template +void Scale(const Context& dev_ctx, const DenseTensor& x, const Scalar& scale, float bias, diff --git a/paddle/pten/kernels/impl/sign_kernel_impl.h b/paddle/pten/kernels/impl/sign_kernel_impl.h new file mode 100644 index 0000000000000..d663808f03792 --- /dev/null +++ b/paddle/pten/kernels/impl/sign_kernel_impl.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace pten { + +template +void Sign(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { + out->mutable_data(); + auto eigen_out = pten::EigenVector::Flatten(*out); + auto eigen_x = pten::EigenVector::Flatten(x); + + auto& dev = *dev_ctx.eigen_device(); + paddle::operators::EigenSign, T>::Eval( + dev, eigen_out, eigen_x); +} + +} // namespace pten diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h new file mode 100644 index 0000000000000..f87d0a31b470b --- /dev/null +++ b/paddle/pten/kernels/math_kernel.h @@ -0,0 +1,155 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/empty_kernel.h" + +namespace pten { + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out); + +template +DenseTensor Add(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(out_meta)); + AddKernel(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} + +template +DenseTensor Subtract(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(out_meta)); + SubtractKernel(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} + +template +DenseTensor Divide(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(out_meta)); + DivideKernel(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} + +template +DenseTensor Multiply(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(out_meta)); + MultiplyKernel(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} + +template +DenseTensor Mean(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + bool keep_dim) { + auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + bool reduce_all = false; + MeanKernel(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out); + return dense_out; +} + +template +DenseTensor Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim) { + auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + + // The real value of reduce_all will be get in kernel + // so use default value(false) is OK. 
+ bool reduce_all = false; + + SumKernel( + dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out); + return dense_out; +} + +} // namespace pten diff --git a/paddle/pten/include/creation.h b/paddle/pten/kernels/matmul_kernel.h similarity index 50% rename from paddle/pten/include/creation.h rename to paddle/pten/kernels/matmul_kernel.h index d685d262ebc1c..fb54a5301e61c 100644 --- a/paddle/pten/include/creation.h +++ b/paddle/pten/kernels/matmul_kernel.h @@ -15,27 +15,31 @@ #pragma once #include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/full_kernel.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/infermeta/binary.h" namespace pten { -// TODO(YuanRisheng) This function name should be same as User API name. -// TODO(zyfncg) Automatic code generation -template -DenseTensor FullLike( - const ContextT& dev_ctx, - const DenseTensor& x, - const Scalar& val, - DataType dtype = DataType::UNDEFINED, - Backend backend = Backend::UNDEFINED, // Is backend needed here? - DataLayout layout = DataLayout::UNDEFINED) { - auto out_meta = FullLikeInferMeta(x.meta(), dtype, layout); - pten::DenseTensor dense_out( +template +void MatmulKernel(const Context& context, + const DenseTensor& x, + const DenseTensor& y, + bool transpose_x, + bool transpose_y, + DenseTensor* out); + +template +DenseTensor Matmul(const Context& context, + const DenseTensor& x, + const DenseTensor& y, + bool transpose_x, + bool transpose_y) { + auto out_meta = MatmulInferMeta(x.meta(), y.meta(), transpose_x, transpose_y); + DenseTensor dense_out( pten::make_intrusive( - dev_ctx.GetPlace()), + context.GetPlace()), std::move(out_meta)); - FullLike(dev_ctx, val, &dense_out); + MatmulKernel(context, x, y, transpose_x, transpose_y, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/npu/CMakeLists.txt b/paddle/pten/kernels/npu/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/paddle/pten/kernels/reshape_kernel.cc b/paddle/pten/kernels/reshape_kernel.cc new file mode 100644 index 0000000000000..d7e2e2707ee1b --- /dev/null +++ b/paddle/pten/kernels/reshape_kernel.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
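The new math_kernel.h and matmul_kernel.h headers above expose thin header-level wrappers (pten::Add, pten::Mean, pten::Matmul, ...) that infer the output meta, allocate an output tensor, and call the registered kernel. A hedged usage sketch on CPU, patterned after the dev-API tests later in this patch: the context and tensor set-up are assumed to exist, the names are illustrative, and the explicit <float> argument assumes the element type is the leading template parameter of these wrappers.

#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/math_kernel.h"
#include "paddle/pten/kernels/matmul_kernel.h"

// `dev_ctx` is a CPU device context (e.g. obtained from the DeviceContextPool
// as in the tests below); x and y are allocated FLOAT32 tensors of shape [3, 3].
template <typename ContextT>
void MathDevApiSketch(const ContextT& dev_ctx,
                      const pten::DenseTensor& x,
                      const pten::DenseTensor& y) {
  // Element-wise add with the default broadcast axis.
  auto sum = pten::Add<float>(dev_ctx, x, y, /*axis=*/-1);
  // 3x3 matrix product, no transposes.
  auto prod = pten::Matmul<float>(dev_ctx, x, y,
                                  /*transpose_x=*/false,
                                  /*transpose_y=*/false);
  // Full reduction to a single mean value, keeping the rank.
  auto mean = pten::Mean<float>(dev_ctx, sum, /*axis=*/{0, 1},
                                /*keep_dim=*/true);
  (void)prod;
  (void)mean;
}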
+ +#include "paddle/pten/kernels/reshape_kernel.h" +#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/infermeta/unary.h" +#include "paddle/pten/kernels/copy_kernel.h" +#include "paddle/pten/kernels/funcs/common_shape.h" + +namespace pten { + +template +void ReshapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shape, + DenseTensor* out) { + auto out_meta = InferMetaFromVecValue(x.meta(), shape.GetData()); + if (x.data() == out->data() && x.numel() == out->numel()) { + out->Resize(out_meta.dims); + return; + } + pten::Copy(dev_ctx, x, false, out); + out->Resize(out_meta.dims); + out->ResetLoD(x.lod()); +} + +template +void ReshapeWithXShape(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shape, + DenseTensor* xshape, + DenseTensor* out) { + funcs::SetXShape(x, xshape); + ReshapeKernel(dev_ctx, x, shape, out); +} + +} // namespace pten + +PT_REGISTER_GENERAL_KERNEL(reshape, + CPU, + ALL_LAYOUT, + pten::ReshapeKernel, + ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, + CPU, + ALL_LAYOUT, + pten::ReshapeWithXShape, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_REGISTER_GENERAL_KERNEL(reshape, + GPU, + ALL_LAYOUT, + pten::ReshapeKernel, + ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, + GPU, + ALL_LAYOUT, + pten::ReshapeWithXShape, + ALL_DTYPE) {} +#endif + +#ifdef PADDLE_WITH_XPU +PT_REGISTER_GENERAL_KERNEL(reshape, + XPU, + ALL_LAYOUT, + pten::ReshapeKernel, + ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, + XPU, + ALL_LAYOUT, + pten::ReshapeWithXShape, + ALL_DTYPE) {} +#endif diff --git a/paddle/pten/kernels/cpu/manipulation.h b/paddle/pten/kernels/reshape_kernel.h similarity index 54% rename from paddle/pten/kernels/cpu/manipulation.h rename to paddle/pten/kernels/reshape_kernel.h index 1a219dc79e601..faa51c69ad17c 100644 --- a/paddle/pten/kernels/cpu/manipulation.h +++ b/paddle/pten/kernels/reshape_kernel.h @@ -14,36 +14,34 @@ limitations under the License. 
*/ #pragma once -#include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/empty_kernel.h" namespace pten { -template -void Flatten(const CPUContext& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out); - -template -void Cast(const CPUContext& dev_ctx, - const DenseTensor& x, - DataType out_dtype, - DataType in_dtype, - DenseTensor* out); - -void Reshape(const CPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* out); - -void ReshapeWithXShape(const CPUContext& dev_ctx, +template +void ReshapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shape, + DenseTensor* out); + +template +void ReshapeWithXShape(const Context& dev_ctx, const DenseTensor& x, const ScalarArray& shape, DenseTensor* xshape, DenseTensor* out); +template +DenseTensor Reshape(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& shape) { + auto out_meta = InferMetaFromVecValue(x.meta(), shape); + auto dense_out = Empty(dev_ctx, std::move(out_meta)); + ReshapeKernel(dev_ctx, x, ScalarArray(shape), &dense_out); + return dense_out; +} + } // namespace pten diff --git a/paddle/pten/kernels/scale_kernel.h b/paddle/pten/kernels/scale_kernel.h index bb3c1968fce9e..5908050029c7a 100644 --- a/paddle/pten/kernels/scale_kernel.h +++ b/paddle/pten/kernels/scale_kernel.h @@ -19,8 +19,8 @@ limitations under the License. */ namespace pten { -template -void Scale(const ContextT& dev_ctx, +template +void Scale(const Context& dev_ctx, const DenseTensor& x, const Scalar& scale, float bias, diff --git a/paddle/pten/backends/npu/npu_context.h b/paddle/pten/kernels/sign_kernel.h similarity index 74% rename from paddle/pten/backends/npu/npu_context.h rename to paddle/pten/kernels/sign_kernel.h index bb17a1bea6df3..2cf5ca973f093 100644 --- a/paddle/pten/backends/npu/npu_context.h +++ b/paddle/pten/kernels/sign_kernel.h @@ -14,13 +14,11 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND_CL - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/core/dense_tensor.h" namespace pten { -using NPUContext = paddle::platform::NPUDeviceContext; -} // namespace pten -#endif // PADDLE_WITH_ASCEND_CL +template +void Sign(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + +} // namespace pten diff --git a/paddle/pten/kernels/xpu/CMakeLists.txt b/paddle/pten/kernels/xpu/CMakeLists.txt index 3ba070bdd6c96..e69de29bb2d1d 100644 --- a/paddle/pten/kernels/xpu/CMakeLists.txt +++ b/paddle/pten/kernels/xpu/CMakeLists.txt @@ -1,2 +0,0 @@ -cc_library(utils_xpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils) -cc_library(manipulation_xpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_xpu unary) diff --git a/paddle/pten/kernels/xpu/utils.cc b/paddle/pten/kernels/xpu/copy_kernel.cc similarity index 89% rename from paddle/pten/kernels/xpu/utils.cc rename to paddle/pten/kernels/xpu/copy_kernel.cc index 5ea3a359ef6d6..190eb39e22ecd 100644 --- a/paddle/pten/kernels/xpu/utils.cc +++ b/paddle/pten/kernels/xpu/copy_kernel.cc @@ -12,14 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
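A short usage sketch for the Reshape convenience wrapper declared above; the underlying kernel resizes in place when input and output share a buffer and copies otherwise. The function name is illustrative, and pten::Reshape<float> assumes the element type is the wrapper's first template parameter.

#include <cstdint>
#include <vector>

#include "paddle/pten/kernels/reshape_kernel.h"

// Reshape an already-allocated [2, 3] float tensor to [3, 2].
template <typename ContextT>
pten::DenseTensor ReshapeSketch(const ContextT& dev_ctx,
                                const pten::DenseTensor& x) {
  return pten::Reshape<float>(dev_ctx, x, std::vector<std::int64_t>{3, 2});
}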
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/pten/kernels/xpu/utils.h" -#include "paddle/fluid/memory/memcpy.h" +#include "paddle/pten/kernels/copy_kernel.h" + +#include "paddle/pten/backends/xpu/xpu_context.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/kernel_registry.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/memcpy.h" namespace pten { -void Copy(const XPUDeviceContext& dev_ctx, +template +void Copy(const Context& dev_ctx, const DenseTensor& src, bool blocking, DenseTensor* dst) { @@ -76,4 +82,5 @@ void Copy(const XPUDeviceContext& dev_ctx, } // namespace pten -PT_REGISTER_NO_TEMPLATE_KERNEL(copy, XPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {} +PT_REGISTER_GENERAL_KERNEL( + copy, XPU, ALL_LAYOUT, pten::Copy, ALL_DTYPE) {} diff --git a/paddle/pten/kernels/xpu/manipulation.cc b/paddle/pten/kernels/xpu/manipulation.cc deleted file mode 100644 index 70ac70371e90a..0000000000000 --- a/paddle/pten/kernels/xpu/manipulation.cc +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pten/kernels/xpu/manipulation.h" -#include "paddle/pten/infermeta/unary.h" -#include "paddle/pten/kernels/hybird/general/manipulation.h" -#include "paddle/pten/kernels/xpu/utils.h" - -namespace pten { - -template -void Flatten(const XPUContext& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out) { - auto out_dims = out->dims(); - pten::Copy(dev_ctx, x, false, out); - out->Resize(out_dims); -} - -// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate -// Output Tensor, -// is there a more flexible way to deal with this case? 
-template -void FlattenWithXShape(const XPUContext& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out, - DenseTensor* xshape) { - Flatten(dev_ctx, x, start_axis, stop_axis, out); - const auto& in_dims = x.dims(); - std::vector xshape_dims(in_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < in_dims.size(); ++i) { - xshape_dims[i + 1] = in_dims[i]; - } - xshape->Resize(paddle::framework::make_ddim(xshape_dims)); - xshape->ResetLoD(x.lod()); -} - -void Reshape(const XPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* out) { - auto out_meta = InferMetaFromVecValue(x.meta(), shape.GetData()); - if (x.data() == out->data() && x.numel() == out->numel()) { - out->Resize(out_meta.dims); - return; - } - pten::Copy(dev_ctx, x, false, out); - out->Resize(out_meta.dims); - out->ResetLoD(x.lod()); -} - -void ReshapeWithXShape(const XPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* xshape, - DenseTensor* out) { - general::SetXShape(x, xshape); - Reshape(dev_ctx, x, shape, out); -} - -} // namespace pten - -PT_REGISTER_KERNEL(flatten, - XPU, - ALL_LAYOUT, - pten::Flatten, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} - -PT_REGISTER_KERNEL(flatten_with_xshape, - XPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} - -PT_REGISTER_NO_TEMPLATE_KERNEL( - reshape, XPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {} diff --git a/paddle/pten/kernels/xpu/manipulation.h b/paddle/pten/kernels/xpu/manipulation.h deleted file mode 100644 index 0b68ae419518e..0000000000000 --- a/paddle/pten/kernels/xpu/manipulation.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
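FlattenWithXShape and ReshapeWithXShape above emit an extra xshape output whose dims are the input dims prefixed with a 0, so later (typically grad) kernels can recover the original shape without keeping the input data alive. A standalone sketch of that convention (the helper name is made up):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// xshape dims = {0, d0, d1, ...}: a leading placeholder followed by the
// input dims, matching the loop in FlattenWithXShape above.
static std::vector<std::int64_t> BuildXShapeDims(
    const std::vector<std::int64_t>& in_dims) {
  std::vector<std::int64_t> xshape(in_dims.size() + 1, 0);
  for (std::size_t i = 0; i < in_dims.size(); ++i) xshape[i + 1] = in_dims[i];
  return xshape;
}

int main() {
  assert(BuildXShapeDims({3, 2, 5}) == (std::vector<std::int64_t>{0, 3, 2, 5}));
  return 0;
}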
*/ - -#pragma once - -#ifdef PADDLE_WITH_XPU - -#include "paddle/pten/backends/xpu/xpu_context.h" -#include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_registry.h" - -namespace pten { - -template -void Flatten(const XPUContext& dev_ctx, - const DenseTensor& x, - int start_axis, - int stop_axis, - DenseTensor* out); - -void Reshape(const XPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* out); - -void ReshapeWithXShape(const XPUContext& dev_ctx, - const DenseTensor& x, - const ScalarArray& shape, - DenseTensor* xshape, - DenseTensor* out); - -} // namespace pten - -#endif diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index e85eb4c3294f1..bb1eab2c09551 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -12,6 +12,7 @@ cc_test(test_framework_place_utils storage SRCS test_place_utils.cc DEPS pten_ap cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS pten_tensor pten_api pten_api_utils) +cc_test(test_empty_api SRCS test_empty_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_fill_api SRCS test_fill_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS pten_tensor pten_api pten_api_utils) diff --git a/paddle/pten/tests/api/test_empty_api.cc b/paddle/pten/tests/api/test_empty_api.cc new file mode 100644 index 0000000000000..fcc01ad8a7172 --- /dev/null +++ b/paddle/pten/tests/api/test_empty_api.cc @@ -0,0 +1,127 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/pten/api/include/api.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace paddle { +namespace tests { + +namespace framework = paddle::framework; +using DDim = paddle::framework::DDim; + +// TODO(chenweihang): Remove this test after the API is used in the dygraph +TEST(API, empty_like) { + // 1. create tensor + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + auto dense_x = std::make_shared( + alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 2}), + pten::DataLayout::NCHW)); + + paddle::experimental::Tensor x(dense_x); + + // 2. test API + auto out = paddle::experimental::empty_like(x, pten::DataType::FLOAT32); + + // 3. 
check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.numel(), 6); + ASSERT_EQ(out.is_cpu(), true); + ASSERT_EQ(out.type(), pten::DataType::FLOAT32); + ASSERT_EQ(out.layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); +} + +TEST(API, empty1) { + // 1. create tensor + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + + auto dense_shape = std::make_shared( + alloc, + pten::DenseTensorMeta(pten::DataType::INT64, + framework::make_ddim({2}), + pten::DataLayout::NCHW)); + auto* shape_data = dense_shape->mutable_data(); + shape_data[0] = 2; + shape_data[1] = 3; + + paddle::experimental::Tensor tensor_shape(dense_shape); + + // 2. test API + auto out = paddle::experimental::empty(tensor_shape, pten::DataType::FLOAT32); + + // 3. check result + ASSERT_EQ(out.shape().size(), 2UL); + ASSERT_EQ(out.shape()[0], 2); + ASSERT_EQ(out.numel(), 6); + ASSERT_EQ(out.is_cpu(), true); + ASSERT_EQ(out.type(), pten::DataType::FLOAT32); + ASSERT_EQ(out.layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); +} + +TEST(API, empty2) { + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + + auto dense_scalar = std::make_shared( + alloc, + pten::DenseTensorMeta(pten::DataType::INT32, + framework::make_ddim({1}), + pten::DataLayout::NCHW)); + dense_scalar->mutable_data()[0] = 2; + + paddle::experimental::Tensor shape_scalar1(dense_scalar); + paddle::experimental::Tensor shape_scalar2(dense_scalar); + std::vector list_shape{shape_scalar1, + shape_scalar2}; + + auto out = paddle::experimental::empty(list_shape, pten::DataType::FLOAT32); + + ASSERT_EQ(out.shape().size(), 2UL); + ASSERT_EQ(out.shape()[0], 2); + ASSERT_EQ(out.numel(), 4); + ASSERT_EQ(out.is_cpu(), true); + ASSERT_EQ(out.type(), pten::DataType::FLOAT32); + ASSERT_EQ(out.layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); +} + +TEST(API, empty3) { + std::vector vector_shape{2, 3}; + + auto out = paddle::experimental::empty(vector_shape, pten::DataType::INT32); + + ASSERT_EQ(out.shape().size(), 2UL); + ASSERT_EQ(out.shape()[0], 2); + ASSERT_EQ(out.numel(), 6); + ASSERT_EQ(out.is_cpu(), true); + ASSERT_EQ(out.type(), pten::DataType::INT32); + ASSERT_EQ(out.layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); +} + +} // namespace tests +} // namespace paddle diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc index e29fa11d58d1d..bef0e2af4cf92 100644 --- a/paddle/pten/tests/api/test_matmul_api.cc +++ b/paddle/pten/tests/api/test_matmul_api.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/gpu/utils.h" +#include "paddle/pten/kernels/copy_kernel.h" namespace paddle { namespace tests { diff --git a/paddle/pten/tests/api/test_pten_tensor.cc b/paddle/pten/tests/api/test_pten_tensor.cc index bffc1b8d89fe0..a28f7ca2ca2e6 100644 --- a/paddle/pten/tests/api/test_pten_tensor.cc +++ b/paddle/pten/tests/api/test_pten_tensor.cc @@ -205,6 +205,11 @@ void TestInitilized() { } } +void TestJudgeTensorType() { + experimental::Tensor test_tensor(paddle::PlaceType::kCPU, {1, 1}); + CHECK(test_tensor.is_dense_tensor() == true); +} + TEST(PtenTensor, All) { VLOG(2) << "TestCopy"; GroupTestCopy(); @@ -220,6 +225,8 @@ TEST(PtenTensor, All) { GroupTestCast(); VLOG(2) << "TestInitilized"; TestInitilized(); + VLOG(2) << "TestJudgeTensorType"; + TestJudgeTensorType(); } } // namespace tests diff --git a/paddle/pten/tests/api/test_tensor_utils.cc b/paddle/pten/tests/api/test_tensor_utils.cc index b59cee5dc7e84..041bd28ad892a 100644 --- a/paddle/pten/tests/api/test_tensor_utils.cc +++ b/paddle/pten/tests/api/test_tensor_utils.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/core/tensor_meta.h" namespace paddle { namespace tests { @@ -30,7 +31,7 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) { const DDim dims({2, 1}); const DataType dtype{DataType::FLOAT32}; const DataLayout layout{DataLayout::NCHW}; - const std::vector> lod{{0, 2}}; + const pten::LoD lod{{0, 2}}; DenseTensorMeta meta(dtype, dims, layout, lod); auto alloc = @@ -46,10 +47,9 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) { CHECK(dense_tensor.lod().size() == lod_tensor.lod().size()); CHECK(dense_tensor.lod()[0] == - static_cast>((lod_tensor.lod()[0]))); + static_cast>((lod_tensor.lod()[0]))); CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(lod_tensor.type())); - CHECK(dense_tensor.layout() == - pten::TransToPtenDataLayout(lod_tensor.layout())); + CHECK(dense_tensor.layout() == lod_tensor.layout()); CHECK(platform::is_cpu_place(lod_tensor.place())); CHECK(lod_tensor.data()[0] == 1.0f); @@ -84,7 +84,7 @@ TEST(tensor_utils, dense_tensor_to_tensor) { experimental::MovesStorage(&dense_tensor, &tensor); CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(tensor.type())); - CHECK(dense_tensor.layout() == pten::TransToPtenDataLayout(tensor.layout())); + CHECK(dense_tensor.layout() == tensor.layout()); CHECK(platform::is_cpu_place(tensor.place())); CHECK(tensor.data()[0] == 1.0f); diff --git a/paddle/pten/tests/common/test_data_layout.cc b/paddle/pten/tests/common/test_data_layout.cc index 66b3e34753896..4e0bb597131b5 100644 --- a/paddle/pten/tests/common/test_data_layout.cc +++ b/paddle/pten/tests/common/test_data_layout.cc @@ -25,10 +25,10 @@ namespace tests { TEST(DataLayout, OStream) { std::ostringstream oss; oss << pten::DataLayout::UNDEFINED; - EXPECT_EQ(oss.str(), "Undefined"); + EXPECT_EQ(oss.str(), "Undefined(AnyLayout)"); oss.str(""); oss << pten::DataLayout::ANY; - EXPECT_EQ(oss.str(), "Undefined"); + EXPECT_EQ(oss.str(), "Undefined(AnyLayout)"); oss.str(""); oss << pten::DataLayout::NHWC; EXPECT_EQ(oss.str(), "NHWC"); @@ -43,8 +43,7 @@ TEST(DataLayout, OStream) { oss << pten::DataLayout::NUM_DATA_LAYOUTS; } catch (const std::exception& exception) { std::string ex_msg = exception.what(); - EXPECT_TRUE(ex_msg.find("Invalid enum data layout type") != 
- std::string::npos); + EXPECT_TRUE(ex_msg.find("Unknown Data Layout type") != std::string::npos); } } diff --git a/paddle/pten/tests/core/allocator.h b/paddle/pten/tests/core/allocator.h index 053e8ba7b382b..094c0e8437d98 100644 --- a/paddle/pten/tests/core/allocator.h +++ b/paddle/pten/tests/core/allocator.h @@ -38,12 +38,18 @@ class HostAllocatorSample : public pten::RawAllocator { class FancyAllocator : public pten::Allocator { public: - static void Delete(void* data) { ::operator delete(data); } + static void Delete(Allocation* allocation) { + ::operator delete(allocation->ptr()); + } Allocation Allocate(size_t bytes_size) override { void* data = ::operator new(bytes_size); - return Allocation(data, data, &Delete, paddle::platform::CPUPlace()); + return Allocation(data, data, &Delete, place()); } + + const paddle::platform::Place& place() override { return place_; } + + paddle::platform::Place place_ = paddle::platform::CPUPlace(); }; template diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index 2879a429d9b82..c6db228c2b757 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -25,7 +25,7 @@ TEST(dense_tensor, meta) { const DataType dtype{DataType::INT8}; const DataLayout layout{DataLayout::NHWC}; // TODO(Shixiaowei02): need to check the lod is valid. - const std::vector> lod{}; + const LoD lod{}; DenseTensorMeta meta_0; CHECK(!meta_0.valid()); @@ -65,14 +65,14 @@ TEST(dense_tensor, meta) { TEST(dense_tensor, def_ctor) { DenseTensor tensor_0; - CHECK(!tensor_0.valid()); + CHECK(tensor_0.valid()); } TEST(dense_tensor, ctor) { const DDim dims({1, 2}); const DataType dtype{DataType::INT8}; const DataLayout layout{DataLayout::NHWC}; - const std::vector> lod{}; + const LoD lod{}; DenseTensorMeta meta(dtype, dims, layout, lod); auto alloc = std::make_shared(); @@ -97,7 +97,6 @@ TEST(dense_tensor, ctor) { check_dense_tensor(tensor_0, meta); DenseTensor tensor_2(make_intrusive(alloc), meta); - CHECK(tensor_2.data() == nullptr); CHECK_NOTNULL(tensor_2.mutable_data()); check_dense_tensor(tensor_2, meta); } @@ -106,7 +105,7 @@ TEST(dense_tensor, resize) { const DDim dims({1, 2}); const DataType dtype{DataType::INT8}; const DataLayout layout{DataLayout::NHWC}; - const std::vector> lod{}; + const LoD lod{}; DenseTensorMeta meta(dtype, dims, layout, lod); auto alloc = std::make_shared(); @@ -122,5 +121,23 @@ TEST(dense_tensor, resize) { CHECK_EQ(storage->size(), 6u); } +TEST(dense_tensor, shallow_copy) { + const DDim dims({1, 2}); + const DataType dtype{DataType::INT8}; + const DataLayout layout{DataLayout::NHWC}; + const LoD lod{}; + DenseTensorMeta meta(dtype, dims, layout, lod); + + auto alloc = std::make_shared(); + DenseTensor tensor_0(alloc, meta); + + DenseTensor tensor_1(tensor_0); + CHECK(tensor_0.meta() == tensor_1.meta()); + + // Copy constructor: Now shares the underlying shared_ptr instead + // of Storage + CHECK(tensor_0.release() != tensor_1.release()); +} + } // namespace tests } // namespace pten diff --git a/paddle/pten/tests/kernels/CMakeLists.txt b/paddle/pten/tests/kernels/CMakeLists.txt index 3a626aad2deb5..6f70f2ca2c895 100644 --- a/paddle/pten/tests/kernels/CMakeLists.txt +++ b/paddle/pten/tests/kernels/CMakeLists.txt @@ -1,7 +1,8 @@ cc_test(test_copy_dev_api SRCS test_copy_dev_api.cc DEPS pten pten_api_utils) cc_test(test_dot_dev_api SRCS test_dot_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_fill_dev_api SRCS test_fill_dev_api.cc DEPS pten pten_api_utils) 
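The new shallow_copy test above checks that copying a DenseTensor shares the underlying holder rather than cloning storage; a toy illustration of that ownership pattern using a plain shared_ptr (not the Paddle classes):

#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>

// Toy tensor: metadata held by value, buffer shared by pointer, so the
// compiler-generated copy constructor performs a shallow copy.
struct ToyTensor {
  std::vector<std::int64_t> dims;
  std::shared_ptr<std::vector<float>> buffer;
};

int main() {
  ToyTensor a{{1, 2}, std::make_shared<std::vector<float>>(2, 0.f)};
  ToyTensor b = a;                  // shares the buffer
  (*b.buffer)[0] = 3.14f;
  assert((*a.buffer)[0] == 3.14f);  // the write is visible through both copies
  assert(a.dims == b.dims);         // metadata compares equal
  return 0;
}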
+cc_test(test_creation_dev_api SRCS test_creation_dev_api.cc DEPS pten pten_api_utils) cc_test(test_flatten_dev_api SRCS test_flatten_dev_api.cc DEPS pten pten_api_utils) +cc_test(test_matmul_dev_api SRCS test_matmul_dev_api.cc DEPS pten pten_api_utils) cc_test(test_mean_dev_api SRCS test_mean_dev_api.cc DEPS pten pten_api_utils) cc_test(test_scale_dev_api SRCS test_scale_dev_api.cc DEPS pten pten_api_utils) cc_test(test_cast_dev_api SRCS test_cast_dev_api.cc DEPS pten pten_api_utils) diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index 5bbaf2a2c373d..cb45d827e3be9 100644 --- a/paddle/pten/tests/kernels/test_cast_dev_api.cc +++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/cast_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/common/data_type.h" @@ -49,13 +49,11 @@ TEST(DEV_API, cast) { auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); pten::DataType out_dtype = pten::DataType::FLOAT64; - pten::DataType in_dtype = pten::DataType::FLOAT32; // 2. test API auto out = pten::Cast( *(static_cast(dev_ctx)), dense_x, - out_dtype, - in_dtype); + out_dtype); // 3. check result ASSERT_EQ(out.dims().size(), 2); diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc index 9cc994c569553..3095c83d97c98 100644 --- a/paddle/pten/tests/kernels/test_copy_dev_api.cc +++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/cpu/utils.h" +#include "paddle/pten/kernels/copy_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" @@ -28,8 +28,7 @@ namespace framework = paddle::framework; using DDim = paddle::framework::DDim; // TODO(YuanRisheng): This TEST file need to be refactored after 'copy' realized -// in -// 'paddle/api', +// in 'paddle/api' TEST(DEV_API, copy) { // 1. create tensor const auto alloc = std::make_shared( diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc new file mode 100644 index 0000000000000..4d753f7d09b8e --- /dev/null +++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc @@ -0,0 +1,142 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/pten/kernels/empty_kernel.h" +#include "paddle/pten/kernels/full_kernel.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace pten { +namespace tests { + +namespace framework = paddle::framework; +using DDim = paddle::framework::DDim; + +TEST(DEV_API, empty) { + // 1. 
create input + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + + // 2. test API + auto out = pten::Empty( + *(static_cast(dev_ctx)), + {3, 2}, + pten::DataType::INT32); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.numel(), 6); + ASSERT_EQ(out.meta().dtype, pten::DataType::INT32); + ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); +} + +TEST(DEV_API, empty_like) { + // 1. create tensor + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + pten::DenseTensor dense_x(alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 2}), + pten::DataLayout::NCHW)); + auto* dense_x_data = dense_x.mutable_data(); + dense_x_data[0] = 0; + + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + + // 2. test API + auto out = pten::EmptyLike( + *(static_cast(dev_ctx)), dense_x); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.numel(), 6); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); +} + +TEST(DEV_API, full) { + // 1. create input + float val = 1.0; + + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + + // 2. test API + auto out = pten::Full( + *(static_cast(dev_ctx)), + {3, 2}, + val, + pten::DataType::FLOAT32); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.numel(), 6); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); + + auto* actual_result = out.data(); + for (auto i = 0; i < 6; i++) { + ASSERT_NEAR(actual_result[i], val, 1e-6f); + } +} + +TEST(DEV_API, full_like) { + // 1. create tensor + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + pten::DenseTensor dense_x(alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 2}), + pten::DataLayout::NCHW)); + auto* dense_x_data = dense_x.mutable_data(); + dense_x_data[0] = 0; + float val = 1.0; + + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + + // 2. test API + auto out = pten::FullLike( + *(static_cast(dev_ctx)), + dense_x, + val); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.numel(), 6); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); + + auto* actual_result = out.data(); + for (auto i = 0; i < 6; i++) { + ASSERT_NEAR(actual_result[i], val, 1e-6f); + } +} + +} // namespace tests +} // namespace pten diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index f12a2d48e6b2b..bd09ecb770a5d 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc index a351be3cf664a..f18e5c050ba70 100644 --- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc @@ -15,12 +15,22 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/flatten_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" +PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +#endif + +#ifdef PADDLE_WITH_XPU +PT_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); +#endif + namespace pten { namespace tests { diff --git a/paddle/pten/tests/kernels/test_fill_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc similarity index 53% rename from paddle/pten/tests/kernels/test_fill_dev_api.cc rename to paddle/pten/tests/kernels/test_matmul_dev_api.cc index 9a8b1f94e731b..7ac3d19554581 100644 --- a/paddle/pten/tests/kernels/test_fill_dev_api.cc +++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/creation.h" +#include "paddle/pten/kernels/matmul_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" @@ -27,38 +27,48 @@ namespace tests { namespace framework = paddle::framework; using DDim = paddle::framework::DDim; -TEST(DEV_API, fill_any_like) { +TEST(DEV_API, dot) { // 1. create tensor const auto alloc = std::make_shared( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, - pten::DenseTensorMeta(pten::DataType::FLOAT32, - framework::make_ddim({3, 2}), - pten::DataLayout::NCHW)); + DenseTensor dense_x(alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 3}), + pten::DataLayout::NCHW)); + auto* dense_x_data = dense_x.mutable_data(); - dense_x_data[0] = 0; - float val = 1.0; + + DenseTensor dense_y(alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 3}), + pten::DataLayout::NCHW)); + auto* dense_y_data = dense_y.mutable_data(); + + for (size_t i = 0; i < 9; ++i) { + dense_x_data[i] = 1.0; + dense_y_data[i] = 2.0; + } + std::vector sum(9, 6.0); paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + auto* ctx = pool.Get(paddle::platform::CPUPlace()); // 2. test API - auto out = pten::FullLike( - *(static_cast(dev_ctx)), - dense_x, - val); + auto out = Matmul( + *(static_cast(ctx)), dense_x, dense_y, false, false); // 3. 
check result ASSERT_EQ(out.dims().size(), 2); ASSERT_EQ(out.dims()[0], 3); - ASSERT_EQ(out.numel(), 6); - ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); - ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); + ASSERT_EQ(out.dtype(), DataType::FLOAT32); + ASSERT_EQ(out.layout(), DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); - auto* actual_result = out.data(); - for (auto i = 0; i < 6; i++) { - ASSERT_NEAR(actual_result[i], val, 1e-6f); + for (size_t i = 0; i < 9; i++) { + ASSERT_NEAR(sum[i], out.data()[i], 1e-6f); } } diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc index 4d062977e23bd..4b254e7e6c1ac 100644 --- a/paddle/pten/tests/kernels/test_mean_dev_api.cc +++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc index 64efdc6f67201..0196e1c211004 100644 --- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/manipulation.h" +#include "paddle/pten/kernels/reshape_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc index 381b8fe44f532..afaf903063781 100644 --- a/paddle/pten/tests/kernels/test_sum_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/scripts/docker/root/.bashrc b/paddle/scripts/docker/root/.bashrc index 2f9c2a9fe92c4..984427434b9d9 100755 --- a/paddle/scripts/docker/root/.bashrc +++ b/paddle/scripts/docker/root/.bashrc @@ -37,5 +37,4 @@ NO_COLOUR="\[\033[0m\]" GREEN="\[\033[1;32m\]" WHITE="\[\033[1;37m\]" -export PS1="\[\033[1;33m\]λ $WHITE\h $GREEN\w$YELLOW\$(__git_ps1 \" \[\033[35m\]{\[\033[36m\]%s\[\033[35m\]}\")$NO_COLOUR " - +export PS1="\[\033[1;33m\]λ $WHITE\h $GREEN\w $NO_COLOUR" diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 0e386ef950616..74f690da7638f 100644 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -65,13 +65,12 @@ function infrt_gen_and_build() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build rm -f infrt_summary.txt - cmake .. -DWITH_MKL=OFF -DWITH_GPU=OFF -DCMAKE_BUILD_TYPE=Release -DWITH_INFRT=ON -DWITH_TESTING==${WITH_TESTING:-ON}; build_error=$? + cmake .. -DWITH_MKL=OFF -DWITH_GPU=OFF -DWITH_CRYPTO=OFF -DCMAKE_BUILD_TYPE=Release -DWITH_INFRT=ON -DWITH_PYTHON=OFF -DWITH_TESTING==${WITH_TESTING:-ON}; build_error=$? if [ "$build_error" != 0 ];then exit 7; fi - make -j ${parallel_number} infrt infrtopt infrt-exec;build_error=$? - make -j ${parallel_number} infrt_lib_dist;build_error=$? + make -j ${parallel_number} infrt infrtopt infrt-exec test_infrt_exec trt-exec infrt_lib_dist;build_error=$? 
if [ "$build_error" != 0 ];then exit 7; fi @@ -113,6 +112,12 @@ function main() { infrt_gen_and_build ${parallel_number} test_infrt ;; + build_only) + infrt_gen_and_build ${parallel_number} + ;; + test_only) + test_infrt + ;; *) print_usage exit 1 @@ -124,9 +129,9 @@ function main() { cat ${PADDLE_ROOT}/build/infrt_summary.txt echo "========================================================" fi - echo "paddle_build script finished as expected" + echo "paddle_build script finished as expected!" } main $@ -rm -rf tmp_dir +rm -rf $tmp_dir diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index c58c78995e5ff..afa0011858987 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -236,6 +236,7 @@ function cmake_base() { -DON_INFER=${ON_INFER:-OFF} -DWITH_HETERPS=${WITH_HETERPS:-OFF} -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} + -DCUDA_ARCH_BIN="${CUDA_ARCH_BIN}" ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -283,6 +284,7 @@ EOF -DON_INFER=${ON_INFER:-OFF} \ -DWITH_HETERPS=${WITH_HETERPS:-OFF} \ -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ + -DCUDA_ARCH_BIN="${CUDA_ARCH_BIN}" \ -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF};build_error=$? if [ "$build_error" != 0 ];then exit 7; @@ -575,7 +577,7 @@ EOF export http_proxy= export https_proxy= set -x - + set +ex if [ "$1" == "cp36-cp36m" ]; then pip3.6 uninstall -y paddlepaddle diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 35736d3923c50..9b657fe667f78 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -6,10 +6,14 @@ set(PY_FILES paddle/__init__.py if(WITH_GPU) SET(PACKAGE_NAME "paddlepaddle-gpu") +elseif(WITH_MLU) + SET(PACKAGE_NAME "paddlepaddle-mlu") elseif(WITH_ROCM) SET(PACKAGE_NAME "paddlepaddle-rocm") elseif(WITH_ASCEND_CL) SET(PACKAGE_NAME "paddlepaddle-npu") +elseif(WITH_XPU) + SET(PACKAGE_NAME "paddlepaddle-xpu") else() SET(PACKAGE_NAME "paddlepaddle") endif() diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 3e808262d5dd0..8ce9716b169b9 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -64,14 +64,13 @@ import paddle.static # noqa: F401 import paddle.vision # noqa: F401 -from .tensor.random import bernoulli # noqa: F401 - from .tensor.attribute import is_complex # noqa: F401 from .tensor.attribute import is_integer # noqa: F401 from .tensor.attribute import rank # noqa: F401 from .tensor.attribute import shape # noqa: F401 from .tensor.attribute import real # noqa: F401 from .tensor.attribute import imag # noqa: F401 +from .tensor.attribute import is_floating_point # noqa: F401 from .tensor.creation import to_tensor # noqa: F401 from .tensor.creation import diag # noqa: F401 from .tensor.creation import diagflat # noqa: F401 @@ -91,6 +90,7 @@ from .tensor.creation import empty_like # noqa: F401 from .tensor.creation import assign # noqa: F401 from .tensor.creation import complex # noqa: F401 +from .tensor.creation import clone # noqa: F401 from .tensor.linalg import matmul # noqa: F401 from .tensor.linalg import dot # noqa: F401 from .tensor.linalg import norm # noqa: F401 @@ -158,6 +158,8 @@ from .tensor.manipulation import roll # noqa: F401 from .tensor.manipulation import chunk # noqa: F401 from .tensor.manipulation import tolist # noqa: F401 +from .tensor.manipulation import take_along_axis # noqa: F401 +from .tensor.manipulation import put_along_axis # noqa: F401 from .tensor.manipulation import tensordot # noqa: F401 from .tensor.manipulation import 
as_complex # noqa: F401 from .tensor.manipulation import as_real # noqa: F401 @@ -197,13 +199,16 @@ from .tensor.math import square # noqa: F401 from .tensor.math import stanh # noqa: F401 from .tensor.math import sum # noqa: F401 +from .tensor.math import nansum # noqa: F401 from .tensor.math import tanh # noqa: F401 from .tensor.math import tanh_ # noqa: F401 from .tensor.math import add_n # noqa: F401 from .tensor.math import max # noqa: F401 from .tensor.math import maximum # noqa: F401 +from .tensor.math import amax # noqa: F401 from .tensor.math import min # noqa: F401 from .tensor.math import minimum # noqa: F401 +from .tensor.math import amin # noqa: F401 from .tensor.math import mm # noqa: F401 from .tensor.math import divide # noqa: F401 from .tensor.math import floor_divide # noqa: F401 @@ -211,6 +216,7 @@ from .tensor.math import mod # noqa: F401 from .tensor.math import floor_mod # noqa: F401 from .tensor.math import multiply # noqa: F401 +from .tensor.math import renorm # noqa: F401 from .tensor.math import add # noqa: F401 from .tensor.math import subtract # noqa: F401 from .tensor.math import logsumexp # noqa: F401 @@ -236,6 +242,7 @@ from .tensor.math import asinh # noqa: F401 from .tensor.math import atanh # noqa: F401 from .tensor.math import lerp # noqa: F401 +from .tensor.math import erfinv # noqa: F401 from .tensor.math import rad2deg # noqa: F401 from .tensor.math import deg2rad # noqa: F401 from .tensor.math import gcd # noqa: F401 @@ -244,7 +251,11 @@ from .tensor.math import angle # noqa: F401 from .tensor.math import fmax # noqa: F401 from .tensor.math import fmin # noqa: F401 +from .tensor.math import inner # noqa: F401 +from .tensor.math import outer # noqa: F401 +from .tensor.random import bernoulli # noqa: F401 +from .tensor.random import poisson # noqa: F401 from .tensor.random import multinomial # noqa: F401 from .tensor.random import standard_normal # noqa: F401 from .tensor.random import normal # noqa: F401 @@ -264,6 +275,8 @@ from .tensor.search import index_select # noqa: F401 from .tensor.search import nonzero # noqa: F401 from .tensor.search import sort # noqa: F401 +from .tensor.search import kthvalue # noqa: F401 +from .tensor.search import mode # noqa: F401 from .tensor.to_string import set_printoptions # noqa: F401 @@ -279,10 +292,12 @@ from .framework import CUDAPlace # noqa: F401 from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 +from .framework import MLUPlace # noqa: F401 from .autograd import grad # noqa: F401 from .autograd import no_grad # noqa: F401 from .autograd import set_grad_enabled # noqa: F401 +from .autograd import is_grad_enabled # noqa: F401 from .framework import save # noqa: F401 from .framework import load # noqa: F401 from .framework import DataParallel # noqa: F401 @@ -296,6 +311,7 @@ from .tensor.stat import var # noqa: F401 from .tensor.stat import numel # noqa: F401 from .tensor.stat import median # noqa: F401 +from .tensor.stat import quantile # noqa: F401 from .device import get_cudnn_version # noqa: F401 from .device import set_device # noqa: F401 from .device import get_device # noqa: F401 @@ -308,6 +324,7 @@ from .device import is_compiled_with_xpu # noqa: F401 from .device import is_compiled_with_npu # noqa: F401 from .device import is_compiled_with_ipu # noqa: F401 +from .device import is_compiled_with_mlu # noqa: F401 from .device import XPUPlace # noqa: F401 from .fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 @@ -389,9 +406,11 @@ 'cos', 
'tan', 'mean', + 'mode', 'mv', 'in_dynamic_mode', 'min', + 'amin', 'any', 'slice', 'normal', @@ -434,6 +453,7 @@ 'roll', 'batch', 'max', + 'amax', 'logical_or', 'bitwise_and', 'bitwise_or', @@ -451,6 +471,7 @@ 'shape', 'real', 'imag', + 'is_floating_point', 'complex', 'reciprocal', 'rand', @@ -464,8 +485,10 @@ 'load', 'numel', 'median', + 'quantile', 'no_grad', 'set_grad_enabled', + 'is_grad_enabled', 'mod', 'abs', 'tril', @@ -482,6 +505,7 @@ 'exp', 'expm1', 'bernoulli', + 'poisson', 'sinh', 'round', 'DataParallel', @@ -492,6 +516,9 @@ 'neg', 'lgamma', 'lerp', + 'erfinv', + 'inner', + 'outer', 'square', 'divide', 'ceil', @@ -511,6 +538,7 @@ 'ones', 'not_equal', 'sum', + 'nansum', 'tile', 'greater_equal', 'isfinite', @@ -587,4 +615,9 @@ 'fmin', 'moveaxis', 'repeat_interleave', + 'clone', + 'kthvalue', + 'renorm', + 'take_along_axis', + 'put_along_axis', ] diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index 661bc3485b3c6..86500db6591b3 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -16,7 +16,7 @@ from . import backward_mode # noqa: F401 from .backward_mode import backward # noqa: F401 from .py_layer import PyLayer, PyLayerContext # noqa: F401 -from ..framework import set_grad_enabled # noqa: F401 +from ..framework import set_grad_enabled, is_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 from .functional import jacobian, hessian, batch_jacobian, batch_hessian # noqa: F401 from .functional import vjp, jvp, vhp # noqa: F401 diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 0a11d59d69c94..d102473fef791 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -29,12 +29,14 @@ 'get_device', 'XPUPlace', 'IPUPlace', + 'MLUPlace', 'is_compiled_with_xpu', 'is_compiled_with_ipu', 'is_compiled_with_cinn', 'is_compiled_with_cuda', 'is_compiled_with_rocm', - 'is_compiled_with_npu' + 'is_compiled_with_npu', + 'is_compiled_with_mlu' ] _cudnn_version = None @@ -120,6 +122,41 @@ def XPUPlace(dev_id): return core.XPUPlace(dev_id) +def is_compiled_with_mlu(): + """ + Whether paddle was built with WITH_MLU=ON to support Cambricon MLU + + Returns (bool): whether paddle was built with WITH_MLU=ON + + Examples: + .. code-block:: python + + # required: mlu + + import paddle + support_mlu = paddle.device.is_compiled_with_mlu() + """ + return core.is_compiled_with_mlu() + + +def MLUPlace(dev_id): + """ + Return a Cambricon MLU Place + + Parameters: + dev_id(int): MLU device id + + Examples: + .. code-block:: python + + # required: mlu + + import paddle + place = paddle.device.MLUPlace(0) + """ + return core.MLUPlace(dev_id) + + def get_cudnn_version(): """ This funciton return the version of cudnn. 
the retuen value is int which represents the @@ -181,13 +218,21 @@ def _convert_to_place(device): "The device should not be 'ipu', " \ "since PaddlePaddle is not compiled with IPU") place = core.IPUPlace() + elif lower_device == 'mlu': + if not core.is_compiled_with_mlu(): + raise ValueError("The device should not be 'mlu', " + "since PaddlePaddle is not compiled with MLU") + selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") + device_id = int(selected_mlus[0]) + place = core.MLUPlace(device_id) else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) avaliable_npu_device = re.match(r'npu:\d+', lower_device) - if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device: + avaliable_mlu_device = re.match(r'mlu:\d+', lower_device) + if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device and not avaliable_mlu_device: raise ValueError( - "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu', 'npu:x' or ipu" + "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'mlu', 'mlu:x', 'npu', 'npu:x' or ipu" ) if avaliable_gpu_device: if not core.is_compiled_with_cuda(): @@ -216,19 +261,28 @@ def _convert_to_place(device): device_id = device_info_list[1] device_id = int(device_id) place = core.NPUPlace(device_id) + if avaliable_mlu_device: + if not core.is_compiled_with_mlu(): + raise ValueError( + "The device should not be {}, since PaddlePaddle is " + "not compiled with mlu".format(avaliable_mlu_device)) + device_info_list = device.split(':', 1) + device_id = device_info_list[1] + device_id = int(device_id) + place = core.MLUPlace(device_id) return place def set_device(device): """ - Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU. + Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU, MLU and IPU. They are represented by string identifiers. This function can specify the global device which the OP will run. Parameters: device(str): This parameter determines the specific running device. - It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``, - where ``x`` is the index of the GPUs, XPUs or NPUs. + It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``mlu``, ``gpu:x``, ``xpu:x``, ``npu:x``, ``mlu:x`` and ``ipu``, + where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. Examples: @@ -249,7 +303,7 @@ def set_device(device): def get_device(): """ This funciton can get the current global device of the program is running. - It's a string which is like 'cpu', 'gpu:x', 'xpu:x' and 'npu:x'. if the global device is not + It's a string which is like 'cpu', 'gpu:x', 'xpu:x', 'mlu:x' and 'npu:x'. if the global device is not set, it will return a string which is 'gpu:x' when cuda is avaliable or it will return a string which is 'cpu' when cuda is not avaliable. 
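Note: the hunks above add MLU ("mlu" / "mlu:x") alongside the existing CPU/GPU/XPU/NPU device strings. A minimal usage sketch of the new API, assuming a build configured with WITH_MLU=ON and at least one visible MLU card (the else branch covers other builds):

# Usage sketch only; paddle.device.is_compiled_with_mlu, MLUPlace and the
# 'mlu:x' device strings are exactly the additions from the hunks above.
import paddle

if paddle.device.is_compiled_with_mlu():
    place = paddle.device.MLUPlace(0)      # explicit place object
    paddle.device.set_device('mlu:0')      # or select by device string
    print(paddle.device.get_device())      # prints 'mlu:0'
else:
    paddle.device.set_device('cpu')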
@@ -277,6 +331,9 @@ def get_device(): elif isinstance(place, core.IPUPlace): num_devices = core.get_ipu_device_count() device = "ipus:{{0-{}}}".format(num_devices - 1) + elif isinstance(place, core.MLUPlace): + device_id = place.get_device_id() + device = 'mlu:' + str(device_id) else: raise ValueError("The device specification {} is invalid".format(place)) diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py index 2a60aad2fd2de..29e1b2694a699 100644 --- a/python/paddle/device/cuda/graphs.py +++ b/python/paddle/device/cuda/graphs.py @@ -17,56 +17,41 @@ if is_compiled_with_cuda() and not is_compiled_with_rocm(): from paddle.fluid.core import CUDAGraph as CoreCUDAGraph - - class CUDAGraph: - def __init__(self, place=None, mode="thread_local"): - ALL_MODES = ["global", "thread_local", "relaxed"] - self._graph = None - if place is None: - device_id = int(os.environ.get('FLAGS_selected_gpus', 0)) - place = CUDAPlace(device_id) - self._place = place - assert mode in ALL_MODES - self._mode = ALL_MODES.index(mode) - - def capture_begin(self): - CoreCUDAGraph.begin_capture(self._place, self._mode) - - def capture_end(self): - self._graph = CoreCUDAGraph.end_capture() - - def replay(self): - self._graph.replay() - - def reset(self): - self._graph.reset() - - def print_to_dot_files(self, dirname, flags=None): - if not isinstance(dirname, (str, bytes)): - dirname = dirname.name - os.makedirs(name=dirname, exist_ok=True) - assert os.path.isdir( - dirname), "The dirname {} should be a directory".format(dirname) - if flags is None: - flags = 2047 # only all information. It can be any integer inside [1, 2048) - self._graph.print_to_dot_files(dirname, flags) else: - - class CUDAGraph: - def __init__(self, place=None, mode="thread_local"): - raise NotImplementedError() - - def capture_begin(self): - raise NotImplementedError() - - def capture_end(self): - raise NotImplementedError() - - def replay(self): - raise NotImplementedError() - - def reset(self): - raise NotImplementedError() - - def print_to_dot_files(self, dirname, flags=None): - raise NotImplementedError() + CoreCUDAGraph = None + + +class CUDAGraph: + def __init__(self, place=None, mode="thread_local"): + assert CoreCUDAGraph is not None, "CUDA Graph is only supported on PaddlePaddle compiled with NVIDIA GPU." + + ALL_MODES = ["global", "thread_local", "relaxed"] + self._graph = None + if place is None: + device_id = int(os.environ.get('FLAGS_selected_gpus', 0)) + place = CUDAPlace(device_id) + self._place = place + assert mode in ALL_MODES + self._mode = ALL_MODES.index(mode) + + def capture_begin(self): + CoreCUDAGraph.begin_capture(self._place, self._mode) + + def capture_end(self): + self._graph = CoreCUDAGraph.end_capture() + + def replay(self): + self._graph.replay() + + def reset(self): + self._graph.reset() + + def print_to_dot_files(self, dirname, flags=None): + if not isinstance(dirname, (str, bytes)): + dirname = dirname.name + os.makedirs(name=dirname, exist_ok=True) + assert os.path.isdir( + dirname), "The dirname {} should be a directory".format(dirname) + if flags is None: + flags = 2047 # only all information. 
It can be any integer inside [1, 2048) + self._graph.print_to_dot_files(dirname, flags) diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 745a018e8cf13..b03858119296e 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -698,13 +698,13 @@ def _get_op_by_id(ops, id): continue # complete the annotation of grad op (xxx_grad op or sum op) - # xxx_grad op will have a corresponding forward op in gradopidx2opidx + # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id grad_op = ops[idx] - if grad_op.desc.id() in dist_op_context.gradopidx2opidx: + if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: # TODO support the case where one forward op corresponding to multiple xxx_grad op forward_op = _get_op_by_id( ops[:first_backward_op_idx], - dist_op_context.gradopidx2opidx[grad_op.desc.id()]) + dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) assert forward_op is not None # op dist attr @@ -769,7 +769,7 @@ def _get_op_by_id(ops, id): dist_context.set_op_dist_attr_for_program(grad_op, grad_op_dist_attr) - # only sum op for merge mutiple version grad has no a corresponding mapping in gradopidx2opidx + # only sum op for merge mutiple version grad has no a corresponding mapping in grad_op_id_to_op_id else: assert grad_op.type == "sum", "got unexpect op [{}]".format( str(grad_op.type)) diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py old mode 100755 new mode 100644 index 347d02dacf416..12bf14fcce5bd --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -46,14 +46,19 @@ class DistributedContext: """ def __init__(self, program=None): + # Program related data members self._serial_program = program - self._serial_graph = None self._is_initialized_for_program = False - self._is_initialized_for_graph = False self._dist_tensors_for_program = {} self._dist_ops_for_program = {} + # Graph related data members + self._is_initialized_for_graph = False + self._serial_graph = None self._dist_tensors_for_graph = {} self._dist_ops_for_graph = {} + self._node_id_to_tensor_id = {} + self._node_id_to_op_id = {} + # Other data members self._dist_op_context = DistributedOperatorContext() self._process_meshes = [] @@ -97,19 +102,43 @@ def add_dist_op_for_program(self, dist_op): def get_dist_tensor_for_program(self, serial_tensor): serial_tensor_id = serial_tensor.desc.id() - return self._dist_tensors_for_program.get(serial_tensor_id, None) + dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, None) + if dist_tensor: + return dist_tensor + else: + serial_tensor_id = serial_tensor.desc.original_id() + dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, + None) + if dist_tensor: + return dist_tensor + else: + return None def get_dist_tensor_for_graph(self, serial_tensor_node): serial_tensor_node_id = serial_tensor_node.id() return self._dist_tensors_for_graph.get(serial_tensor_node_id, None) - def get_dist_op_for_program(self, serial_tensor): + def get_dist_op_for_program(self, serial_op): + serial_op_id = serial_op.desc.id() + dist_op = self._dist_ops_for_program.get(serial_op_id, None) + if dist_op: + return dist_op + else: + serial_op_id = serial_op.desc.original_id() + dist_op = self._dist_ops_for_program.get(serial_op_id, None) + if dist_op: + return dist_op + 
else: + return None + + def del_dist_op_for_program(self, serial_tensor): serial_tensor_id = serial_tensor.desc.id() - return self._dist_ops_for_program.get(serial_tensor_id, None) + if self._dist_ops_for_program.get(serial_tensor_id, None): + del self._dist_ops_for_program[serial_tensor_id] - def get_dist_op_for_graph(self, serial_tensor_node): - serial_tensor_node_id = serial_tensor_node.id() - return self._dist_ops_for_graph.get(serial_tensor_node_id, None) + def get_dist_op_for_graph(self, serial_op_node): + serial_op_node_id = serial_op_node.id() + return self._dist_ops_for_graph.get(serial_op_node_id, None) def get_tensor_dist_attr_for_program(self, serial_tensor): serial_tensor_id = serial_tensor.desc.id() @@ -117,7 +146,13 @@ def get_tensor_dist_attr_for_program(self, serial_tensor): if dist_tensor: return dist_tensor.dist_attr else: - return None + serial_tensor_id = serial_tensor.desc.original_id() + dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, + None) + if dist_tensor: + return dist_tensor.dist_attr + else: + return None def set_tensor_dist_attr_for_program(self, serial_tensor, dist_attr): dist_tensor = DistributedTensor(serial_tensor, dist_attr) @@ -132,25 +167,18 @@ def get_tensor_dist_attr_for_graph(self, serial_tensor_node): else: return None - def set_tensor_dist_attr_for_graph(self, serial_tensor_node, dist_attr): - assert serial_tensor_node.is_var() and \ - serial_tensor_node.var() is not None - serial_tensor_id = serial_tensor_node.var().id() - dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, None) - assert dist_tensor is not None, \ - "The distributed tensor of the program has not been added to this context." - serial_tensor_node_id = serial_tensor_node.id() - new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, - dist_attr) - self._dist_tensors_for_graph[serial_tensor_node_id] = new_dist_tensor - def get_op_dist_attr_for_program(self, serial_op): serial_op_id = serial_op.desc.id() dist_op = self._dist_ops_for_program.get(serial_op_id, None) if dist_op: return dist_op.dist_attr else: - return None + serial_op_id = serial_op.desc.original_id() + dist_op = self._dist_ops_for_program.get(serial_op_id, None) + if dist_op: + return dist_op.dist_attr + else: + return None def set_op_dist_attr_for_program(self, serial_op, dist_attr): dist_op = DistributedOperator(serial_op, dist_attr) @@ -164,17 +192,6 @@ def get_op_dist_attr_for_graph(self, serial_op_node): else: return None - def set_op_dist_attr_for_graph(self, serial_op_node, dist_attr): - assert serial_op_node.is_op() and \ - serial_op_node.op() is not None - serial_op_id = serial_op_node.op().id() - dist_op = self._dist_ops_for_program.get(serial_op_id, None) - assert dist_op is not None, \ - "The distributed operator of the program has not been added to this context." - serial_op_node_id = serial_op_node.id() - new_dist_op = DistributedOperator(dist_op.serial_op, dist_attr) - self._dist_ops_for_graph[serial_op_node_id] = new_dist_op - def init_dist_attr_for_program(self): assert self._serial_program, \ "Please set the program of this context before initializing its distribute attributes." 
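Note: the accessors rewritten above all share one lookup pattern, try the desc id first and then fall back to the original id that a cloned desc carries. A condensed sketch of that pattern (the helper name is illustrative, not a function in dist_context.py):

# Sketch of the id -> original_id fallback used by the get_dist_*_for_program
# accessors above; 'lookup_dist_object' is an illustrative name only.
def lookup_dist_object(table, desc):
    dist_obj = table.get(desc.id(), None)
    if dist_obj is not None:
        return dist_obj
    # descs copied while rewriting the program keep the id of the desc they
    # were cloned from, so retry the lookup with that original id
    return table.get(desc.original_id(), None)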
@@ -216,20 +233,36 @@ def init_dist_attr_for_graph(self): all_nodes = self._serial_graph.all_nodes() for node in all_nodes: if node.is_var() and node.var() is not None: - tensor_desc = node.var() - tensor_id = tensor_desc.id() - dist_tensor = self._dist_tensors_for_program.get(tensor_id, - None) + dist_tensor = None + tensor_id = node.node.original_desc_id() + for cur_tensor_id, cur_dist_tensor in self._dist_tensors_for_program.items( + ): + if tensor_id == cur_tensor_id \ + or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): + dist_tensor = cur_dist_tensor + self._node_id_to_tensor_id[node.id()] = cur_tensor_id assert dist_tensor is not None, \ "Tensor must have a distributed tensor after the initialization for program." - self.set_tensor_dist_attr_for_graph(node, dist_tensor.dist_attr) + serial_tensor_node_id = node.id() + new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, + dist_tensor.dist_attr) + self._dist_tensors_for_graph[ + serial_tensor_node_id] = new_dist_tensor if node.is_op() and node.op() is not None: - op_desc = node.op() - op_id = op_desc.id() - dist_op = self._dist_ops_for_program.get(op_id, None) + dist_op = None + op_id = node.node.original_desc_id() + for cur_op_id, cur_dist_op in self._dist_ops_for_program.items( + ): + if op_id == cur_op_id \ + or op_id == cur_dist_op.serial_op.desc.original_id(): + dist_op = cur_dist_op + self._node_id_to_op_id[node.id()] = cur_op_id assert dist_op is not None, \ "Operator must have a distributed operator after the initialization for program." - self.set_op_dist_attr_for_graph(node, dist_op.dist_attr) + serial_op_node_id = node.id() + new_dist_op = DistributedOperator(dist_op.serial_op, + dist_op.dist_attr) + self._dist_ops_for_graph[serial_op_node_id] = new_dist_op self._is_initialized_for_graph = True def clear_dist_info_for_program(self): @@ -247,9 +280,8 @@ def copy_dist_attr_from_graph_to_program(self): all_nodes = self._serial_graph.all_nodes() for node in all_nodes: if node.is_var() and node.var() is not None: - tensor_desc = node.var() - tensor_id = tensor_desc.id() - updated = updated_tensors.get(tensor_desc.name(), False) + tensor_id = self._node_id_to_tensor_id[node.id()] + updated = updated_tensors.get(tensor_id, False) # If a var has multiples var nodes in graph, only use the first one for now if not updated: tensor_dist_attr_for_graph = self.get_tensor_dist_attr_for_graph( @@ -257,10 +289,9 @@ def copy_dist_attr_from_graph_to_program(self): dist_tensor_for_program = self._dist_tensors_for_program[ tensor_id] dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph - updated_tensors[tensor_desc.name()] = True + updated_tensors[tensor_id] = True if node.is_op() and node.op() is not None: - op_desc = node.op() - op_id = op_desc.id() + op_id = self._node_id_to_op_id[node.id()] op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program.dist_attr = op_dist_attr_for_graph @@ -360,7 +391,7 @@ def __init__(self): self._rank_id = None self._cur_src_op = None self._cur_dist_attr = None - self.gradopidx2opidx = {} + self.grad_op_id_to_op_id = {} self.already_init_sync_vars = set() def __deepcopy__(self, memo): @@ -404,7 +435,7 @@ def set_cur_src_op(self, cur_src_op): def get_cur_src_op(self): return self._cur_src_op - def prepare_forward_context(self, src_op): + def prepare_context(self, src_op): self.set_cur_src_op(src_op) @@ -413,6 +444,7 @@ def prepare_forward_context(self, src_op): for input_name in 
src_op.desc.input_names(): varnames = [] for varname in src_op.desc.input(input_name): + assert varname in self._varname_mapping varnames.append(self._varname_mapping[varname]) kinputs[input_name] = varnames @@ -421,29 +453,8 @@ def prepare_forward_context(self, src_op): for output_name in src_op.desc.output_names(): varnames = [] for varname in src_op.desc.output(output_name): + assert varname in self._varname_mapping varnames.append(self._varname_mapping[varname]) koutputs[output_name] = varnames return kinputs, koutputs - - def prepare_backward_context(self, backward_op): - - self.set_cur_src_op(backward_op) - - # build input varname mapping - kinputs = {} - for input_name in backward_op.desc.input_names(): - varnames = [] - for varname in backward_op.desc.input(input_name): - varnames.append(varname) - kinputs[input_name] = varnames - - # build output varname mapping - koutputs = {} - for output_name in backward_op.desc.output_names(): - varnames = [] - for varname in backward_op.desc.output(output_name): - varnames.append(varname) - koutputs[output_name] = varnames - - return kinputs, koutputs diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 3ebda4694c630..32496b94b920c 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License +from ..dist_attribute import OperatorDistributedAttribute + _g_distributed_operator_impl_registries = {} +BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale'} class DistributedOperatorImplContainer: @@ -114,6 +117,14 @@ def find_best_compatible_distributed_operator_impl(name, dist_op, fwd=True): return best_compatible_impl, idx +def is_parameter_related(varname, block): + if ".cast_fp" in varname: + varname = varname[:varname.index(".cast_fp")] + assert block.has_var(varname) + var = block.var(varname) + return var.is_parameter + + def infer_shape(block, src_var, src_var_dist_attr, op_input_dist_attr): var_shape = block.var(src_var.name).shape var_topoloy = src_var_dist_attr.process_mesh.topology @@ -138,3 +149,46 @@ def infer_shape(block, src_var, src_var_dist_attr, op_input_dist_attr): exact_shape.append(new_shape) return exact_shape + + +def set_comm_op_dist_attr_for_program(new_op, process_mesh, tensor_dist_attr, + ctx): + assert process_mesh is not None + assert tensor_dist_attr is not None + + new_op_dist_attr = OperatorDistributedAttribute() + new_op_dist_attr.process_mesh = process_mesh + for input_varname in new_op.desc.input_arg_names(): + new_op_dist_attr.set_input_dist_attr(input_varname, tensor_dist_attr) + for output_varname in new_op.desc.output_arg_names(): + new_op_dist_attr.set_output_dist_attr(output_varname, tensor_dist_attr) + ctx.set_op_dist_attr_for_program(new_op, new_op_dist_attr) + + +def naive_copy_op_dist_attr_for_program(new_op, ref_op, ctx): + + ref_dist_attr = ctx.get_op_dist_attr_for_program(ref_op) + new_op_dist_attr = OperatorDistributedAttribute() + new_op_dist_attr.process_mesh = ref_dist_attr.process_mesh + + for input_name in ref_op.input_names: + assert input_name in new_op.input_names + assert len(ref_op.input(input_name)) == 1 + assert len(new_op.input(input_name)) == 1 + + ref_tensor_dist_attr = ref_dist_attr.get_input_dist_attr( + ref_op.input(input_name)[0]) + new_op_dist_attr.set_input_dist_attr( + new_op.input(input_name)[0], 
ref_tensor_dist_attr) + + for output_name in ref_op.output_names: + assert output_name in new_op.output_names + assert len(ref_op.output(output_name)) == 1 + assert len(new_op.output(output_name)) == 1 + + ref_tensor_dist_attr = ref_dist_attr.get_output_dist_attr( + ref_op.output(output_name)[0]) + new_op_dist_attr.set_output_dist_attr( + new_op.output(output_name)[0], ref_tensor_dist_attr) + + ctx.set_op_dist_attr_for_program(new_op, new_op_dist_attr) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py old mode 100755 new mode 100644 index 05af1b402b425..1a3d57bf140dd --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -15,13 +15,14 @@ from .common import DistributedOperatorImplContainer from .common import DistributedOperatorImpl from .common import register_distributed_operator_impl_container -from .common import register_distributed_operator_impl +from .common import register_distributed_operator_impl, is_parameter_related from ..utils import is_dim_shard from ..utils import is_dim_replicate from ..utils import is_valid_list_index from ..utils import compute_compatible_dim_mapping from ..utils import compute_compatible_dims_mapping from ..utils import compute_compatible_and_update_dim_mapping +from ..utils import set_dist_op_desc_original_id from ..dist_attribute import OperatorDistributedAttribute from paddle.fluid import core, unique_name from paddle.fluid.framework import in_dygraph_mode @@ -66,7 +67,6 @@ def forward(ctx, *args, **kwargs): main_block = dist_op_context.get_dst_main_program().global_block() startup_block = dist_op_context.get_dst_startup_program().global_block() src_op = dist_op_context.get_cur_src_op() - varname_mapping = dist_op_context.get_varname_mapping() rank_id = dist_op_context.get_rank_id() # check validation of inputs / outputs @@ -87,6 +87,7 @@ def forward(ctx, *args, **kwargs): # replicate op in dist program dist_op_desc = main_block.desc.append_op() dist_op_desc.copy_from(src_op.desc) + set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): dist_op_desc.set_input(input_name, kwargs[input_name]) for output_name in src_op.desc.output_names(): @@ -153,14 +154,41 @@ def backward(ctx, *args, **kwargs): str(backward_op)) rank_id = dist_op_context.get_rank_id() + # check validation of inputs / outputs + for input_name in backward_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + backward_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in backward_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + backward_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + # replicate op in dist program + dist_op_desc = main_block.desc.append_op() + dist_op_desc.copy_from(backward_op.desc) + # Refer to the related dist op + set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) + for input_name in backward_op.desc.input_names(): + dist_op_desc.set_input(input_name, kwargs[input_name]) + for output_name in backward_op.desc.output_names(): + dist_op_desc.set_output(output_name, kwargs[output_name]) + + 
main_block._sync_with_cpp() + # check if need gradient allreduce # if there is a non-gradient & non-parameter input and its batch dimension is splited, # we need insert gradient allreduce for the gradient of parameter in its output need_gradient_allreduce = False for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): - if "@GRAD" not in varname and not main_block.var( - varname).is_parameter: + if "@GRAD" not in varname and not is_parameter_related( + varname, main_block): # NOTE input var's dim_mapping of backward op should be the same with input var instead of corresponding varname of forward op process_mesh = dist_attr.process_mesh @@ -186,8 +214,8 @@ def backward(ctx, *args, **kwargs): allreduce_vars = [] for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): - if "@GRAD" not in varname and main_block.var( - varname).is_parameter: + if "@GRAD" not in varname and is_parameter_related( + varname, main_block): assert len( backward_op.desc.input(input_name) ) == 1, "parameter input to grad op should be length 1, but got [{}]".format( diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py old mode 100755 new mode 100644 index 20722cdf60576..866fed1ae6067 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -16,17 +16,17 @@ from .common import DistributedOperatorImplContainer from .common import DistributedOperatorImpl from .common import register_distributed_operator_impl_container -from .common import register_distributed_operator_impl +from .common import register_distributed_operator_impl, set_comm_op_dist_attr_for_program, naive_copy_op_dist_attr_for_program, is_parameter_related from ..utils import is_dim_shard from ..utils import is_dim_replicate from ..utils import is_valid_list_index from ..utils import compute_compatible_dim_mapping from ..utils import compute_compatible_dims_mapping from ..utils import compute_compatible_and_update_dim_mapping -from ..dist_attribute import OperatorDistributedAttribute +from ..dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute from paddle.fluid import core, unique_name from paddle.fluid.framework import in_dygraph_mode -from paddle.fluid.framework import Program, Parameter, Variable, program_guard +from paddle.fluid.framework import Program, Parameter, Variable from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY from ..process_group import new_process_group @@ -283,34 +283,35 @@ def forward(ctx, *args, **kwargs): allreduce_op_dist_attr) # param initialization sync - assert Weight_var.name not in dist_op_context.already_init_sync_vars - dist_op_context.already_init_sync_vars.add(Weight_var.name) - param = startup_block.var(Weight_var.name) - param_dist_attr = ctx.get_tensor_dist_attr_for_program(param) - process_mesh = param_dist_attr.process_mesh - dim_mapping = param_dist_attr.dims_mapping - - # NOTE all not splited axis should be presented in mesh - for axis, size in enumerate(process_mesh.topology): - if size <= 1 or axis in dim_mapping: - pass - else: - group_ranks = _get_comm_group(process_mesh.processes, - process_mesh.topology, axis, - rank_id) - sync_group = new_process_group(group_ranks) - - 
startup_block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': sync_group.id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward - }) - startup_block._sync_with_cpp() + if Weight_var.is_parameter: + assert Weight_var.name not in dist_op_context.already_init_sync_vars + dist_op_context.already_init_sync_vars.add(Weight_var.name) + param = startup_block.var(Weight_var.name) + param_dist_attr = ctx.get_tensor_dist_attr_for_program(param) + process_mesh = param_dist_attr.process_mesh + dim_mapping = param_dist_attr.dims_mapping + + # NOTE all not splited axis should be presented in mesh + for axis, size in enumerate(process_mesh.topology): + if size <= 1 or axis in dim_mapping: + pass + else: + group_ranks = _get_comm_group(process_mesh.processes, + process_mesh.topology, axis, + rank_id) + sync_group = new_process_group(group_ranks) + + startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + startup_block._sync_with_cpp() @staticmethod def backward(ctx, *args, **kwargs): @@ -329,9 +330,6 @@ def backward(ctx, *args, **kwargs): rank_id = _get_corresponding_rank(ctx, dist_attr.process_mesh, rank_id) - # check if need gradient allreduce - need_gradient_allreduce = False - assert 'Ids' in kwargs, "input [{}] is not given".format('Ids') assert 'W' in kwargs, "input [{}] is not given".format('W') assert 'Out@GRAD' in kwargs, "input [{}] is not given".format('Out') @@ -355,6 +353,84 @@ def backward(ctx, *args, **kwargs): kwargs['W@GRAD']) Ids_var = main_block.var(kwargs['Ids'][0]) + Weight_var = main_block.var(kwargs['W'][0]) + Out_grad = main_block.var(kwargs['Out@GRAD'][0]) + Weight_grad = main_block.var(kwargs['W@GRAD'][0]) + + embedding_row_dim_mapping = dist_attr.get_input_dims_mapping( + Weight_var.name)[0] + assert embedding_row_dim_mapping >= 0, "row_parallel_embedding's row should be divided by a specific mesh axis, but got [{}]".format( + embedding_row_dim_mapping) + process_mesh_shape = dist_attr.process_mesh.topology + process_mesh_group = dist_attr.process_mesh.processes + + # A generalized method to caculate embedding offset using cartisian product + relative_idx = _get_idx_in_axis(process_mesh_group, process_mesh_shape, + embedding_row_dim_mapping, rank_id) + per_part_size = Weight_var.shape[0] + relative_idx = relative_idx * per_part_size + + check_variable_and_dtype( + Out_grad, 'tensor', + ['float16', 'float32', 'float64', 'int32', 'int64'], '_c_identity') + + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_embedding", '@tmp_0@GRAD'])), + dtype=Out_grad.dtype, + shape=Out_grad.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=Out_grad.stop_gradient) + + # copy X_var's dist_attr to intermediate_var_0's dist_attr + out_grad_dist_attr = dist_attr.get_input_dist_attr(Out_grad.name) + assert out_grad_dist_attr is not None + ctx.set_tensor_dist_attr_for_program(intermediate_var_0, + out_grad_dist_attr) + + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + embedding_row_dim_mapping, rank_id) + group = new_process_group(group_ranks) + + c_identity_op = main_block.append_op( + type='c_identity', + inputs={'X': [Out_grad]}, + outputs={'Out': intermediate_var_0}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + 
OP_ROLE_KEY: OpRole.Backward, + }) + check_variable_and_dtype(intermediate_var_0, 'x', + ['float16', 'float32', 'float64'], 'linear') + check_dtype(intermediate_var_0.dtype, 'dtype', + ['float16', 'float32', 'float64'], 'linear') + + set_comm_op_dist_attr_for_program(c_identity_op, dist_attr.process_mesh, + out_grad_dist_attr, ctx) + + main_block._sync_with_cpp() + c_embedding_grad_op_desc = main_block.desc.append_op() + c_embedding_grad_op_desc.set_type("c_embedding_grad") + c_embedding_grad_op_desc.set_input('Ids', [Ids_var.name]) + c_embedding_grad_op_desc.set_input('W', [Weight_var.name]) + c_embedding_grad_op_desc.set_input('Out@GRAD', + [intermediate_var_0.name]) + c_embedding_grad_op_desc.set_output('W@GRAD', [Weight_grad.name]) + c_embedding_grad_op_desc._set_attr('start_index', relative_idx) + c_embedding_grad_op_desc._set_attr(OP_ROLE_KEY, OpRole.Backward) + main_block._sync_with_cpp() + + c_embedding_grad_op = main_block.ops[-1] + assert c_embedding_grad_op.type == "c_embedding_grad" + naive_copy_op_dist_attr_for_program(c_embedding_grad_op, backward_op, + ctx) + + # check if need gradient allreduce + need_gradient_allreduce = False + process_mesh = dist_attr.process_mesh var_dim_mapping = dist_attr.get_input_dims_mapping(Ids_var.name) mesh_shape = process_mesh.topology diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 3a4d8412bf835..f4c31c3654c52 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -12,17 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License +import copy from .common import infer_shape from .common import DistributedOperatorImplContainer from .common import DistributedOperatorImpl from .common import register_distributed_operator_impl_container from .common import register_distributed_operator_impl +from .common import set_comm_op_dist_attr_for_program, naive_copy_op_dist_attr_for_program, is_parameter_related from ..utils import is_dim_shard from ..utils import is_dim_replicate from ..utils import is_valid_list_index from ..utils import compute_compatible_dim_mapping from ..utils import compute_compatible_dims_mapping from ..utils import compute_compatible_and_update_dim_mapping +from ..utils import set_dist_op_desc_original_id from ..dist_attribute import OperatorDistributedAttribute from paddle.fluid import core, unique_name from paddle.fluid.framework import in_dygraph_mode @@ -33,6 +36,21 @@ from ..utils import _get_comm_group, _get_corresponding_rank +def copy_op_with_new_input_output(ctx, block, src_op, **kwargs): + dist_op_desc = block.desc.append_op() + dist_op_desc.copy_from(src_op.desc) + set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) + for input_name in src_op.desc.input_names(): + assert input_name in kwargs + dist_op_desc.set_input(input_name, kwargs[input_name]) + for output_name in src_op.desc.output_names(): + assert input_name in kwargs + dist_op_desc.set_output(output_name, kwargs[output_name]) + + block._sync_with_cpp() + return dist_op_desc + + def _update_dims_mapping_for_matmul(dist_op): changed = False op_desc = dist_op.serial_op.desc @@ -141,15 +159,11 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): if rank_id not in dist_attr.process_mesh.processes: rank_id = _get_corresponding_rank(ctx, dist_attr.process_mesh, rank_id) - # check if need gradient 
allreduce - need_gradient_allreduce = False - assert 'Y' in kwargs, "input [{}] is not given".format('Y') assert 'X' in kwargs, "input [{}] is not given".format('X') assert 'Out@GRAD' in kwargs, "input [{}] is not given".format('Out@GRAD') assert 'Y@GRAD' in kwargs, "output [{}] is not given".format('Y@GRAD') assert 'X@GRAD' in kwargs, "output [{}] is not given".format('X@GRAD') - assert len( kwargs['Y'] ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( @@ -166,15 +180,140 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): kwargs['Y@GRAD'] ) == 1, "row_parallel_embedding output Ids take 1 variable but got {}".format( kwargs['Y@GRAD']) - assert len( - kwargs['X@GRAD'] - ) == 1, "row_parallel_embedding output Ids take 1 variable but got {}".format( - kwargs['X@GRAD']) X_var = main_block.var(kwargs['X'][0]) - assert not X_var.is_parameter, "left operand(X) [{}] of dist matmul should not be parameter".format( + Y_var = main_block.var(kwargs['Y'][0]) + Out_grad = main_block.var(kwargs['Out@GRAD'][0]) + Y_grad = main_block.var(kwargs['Y@GRAD'][0]) + + assert not is_parameter_related( + X_var.name, main_block + ), "left operand(X) [{}] of dist matmul should not be parameter".format( X_var.name) + Y_var_dim_mapping = dist_attr.get_input_dims_mapping(Y_var.name) + process_mesh_shape = dist_attr.process_mesh.topology + process_mesh_group = dist_attr.process_mesh.processes + assert len( + Y_var_dim_mapping + ) == 2, "dist matmual only support Y operand with 2 dims now but Y({})'s dim is [{}]".format( + Y_var.name, Y_var_dim_mapping) + Y_var_partitioned = False + for dim in Y_var_dim_mapping: + if dim >= 0 and process_mesh_shape[dim] > 0: + Y_var_partitioned = True + break + + if is_parameter_related(Y_var.name, main_block) and Y_var_partitioned: + + if Y_var_dim_mapping[0] >= 0: + # row parallel: c_identity + matmul + assert Y_var_dim_mapping[1] < 0 + parallel_axis = Y_var_dim_mapping[0] + + check_variable_and_dtype( + Out_grad, 'tensor', + ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_identity') + + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_identity", 'tmp'])) + "@GRAD", + dtype=Out_grad.dtype, + shape=Out_grad.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=Out_grad.stop_gradient) + + # copy X_var's dist_attr to intermediate_var_0's dist_attr + out_grad_dist_attr = dist_attr.get_input_dist_attr(Out_grad.name) + assert out_grad_dist_attr is not None + ctx.set_tensor_dist_attr_for_program(intermediate_var_0, + out_grad_dist_attr) + + group_ranks = _get_comm_group( + process_mesh_group, process_mesh_shape, parallel_axis, rank_id) + group = new_process_group(group_ranks) + c_identity_op = main_block.append_op( + type='c_identity', + inputs={'X': [Out_grad]}, + outputs={'Out': intermediate_var_0}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + OP_ROLE_KEY: OpRole.Backward, + }) + check_variable_and_dtype(intermediate_var_0, 'x', + ['float16', 'float32', 'float64'], + 'linear') + check_dtype(intermediate_var_0.dtype, 'dtype', + ['float16', 'float32', 'float64'], 'linear') + set_comm_op_dist_attr_for_program( + c_identity_op, dist_attr.process_mesh, out_grad_dist_attr, ctx) + + new_kwargs = copy.deepcopy(kwargs) + new_kwargs['Out@GRAD'] = [intermediate_var_0.name] + matmul_op_desc = copy_op_with_new_input_output( + ctx, main_block, backward_op, **new_kwargs) + else: + # col parallel: matmul + allreduce 
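# NOTE: a hedged, standalone sketch of the control flow implemented by the row
# parallel branch above and the col parallel branch that follows.  The three
# callables are hypothetical stand-ins for the append_op /
# copy_op_with_new_input_output calls in this hunk (not Paddle APIs), and the
# parameter/partition check is simplified.
def matmul_grad_dispatch(y_dims_mapping, has_x_grad, insert_c_identity,
                         run_matmul_grad, insert_c_allreduce_sum):
    row_sharded = y_dims_mapping[0] >= 0
    col_sharded = y_dims_mapping[1] >= 0
    if row_sharded:
        # row parallel: route Out@GRAD through c_identity, then replay the
        # serial matmul grad op with the renamed input
        insert_c_identity('Out@GRAD')
        run_matmul_grad()
    elif col_sharded:
        # col parallel: replay the matmul grad op first, then allreduce the
        # partial X@GRAD (skipped when X has no gradient)
        run_matmul_grad()
        if has_x_grad:
            insert_c_allreduce_sum('X@GRAD')
    else:
        # replicated weight: just replay the serial grad op unchanged
        run_matmul_grad()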
+ assert Y_var_dim_mapping[0] < 0 + parallel_axis = Y_var_dim_mapping[1] + new_kwargs = copy.deepcopy(kwargs) + + # NOTE (JZ-LIANG) should allow left operand be empty for matmul grad + has_x_grad = len(kwargs['X@GRAD']) > 0 + if has_x_grad: + assert len(kwargs['X@GRAD']) == 1 + X_grad = main_block.var(kwargs['X@GRAD'][0]) + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_identity", 'tmp'])) + "@GRAD", + dtype=X_grad.dtype, + shape=X_grad.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=X_grad.stop_gradient) + + X_grad_dist_attr = dist_attr.get_output_dist_attr(X_grad.name) + assert X_grad_dist_attr is not None + ctx.set_tensor_dist_attr_for_program(intermediate_var_0, + X_grad_dist_attr) + new_kwargs['X@GRAD'] = [intermediate_var_0.name] + + matmul_op_desc = copy_op_with_new_input_output( + ctx, main_block, backward_op, **new_kwargs) + + # NOTE (JZ-LIANG) trick to skip one allreduce if left operand has not grad + if has_x_grad: + group_ranks = _get_comm_group(process_mesh_group, + process_mesh_shape, parallel_axis, + rank_id) + group = new_process_group(group_ranks) + c_allreduce_sum_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [intermediate_var_0.name]}, + outputs={'Out': kwargs['X@GRAD']}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + OP_ROLE_KEY: OpRole.Backward + }) + set_comm_op_dist_attr_for_program(c_allreduce_sum_op, + dist_attr.process_mesh, + X_grad_dist_attr, ctx) + else: + # replicate + matmul_op_desc = copy_op_with_new_input_output(ctx, main_block, + backward_op, **kwargs) + + main_block._sync_with_cpp() + + # check if need gradient allreduce + need_gradient_allreduce = False + process_mesh = dist_attr.process_mesh var_dim_mapping = dist_attr.get_input_dims_mapping(X_var.name) mesh_shape = process_mesh.topology @@ -187,8 +326,7 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): dp_degree = len(group_ranks) dp_group = new_process_group(group_ranks) - Y_var = main_block.var(kwargs['Y'][0]) - if need_gradient_allreduce and Y_var.is_parameter: + if need_gradient_allreduce and is_parameter_related(Y_var.name, main_block): Y_Grad_var = main_block.var(kwargs['Y@GRAD'][0]) allreduce_op = main_block.append_op( type='c_allreduce_sum', @@ -310,6 +448,7 @@ def is_auto_compatible(self, dist_op): y_dims_mapping), "now just support x dims > y dims" if len(y_dims_mapping) != 2: return False + if len(x_dims_mapping) == len(y_dims_mapping) and len( x_dims_mapping) == 4: if x_dims_mapping[:2] != y_dims_mapping[:2]: diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py index d72d13803ff3a..e287bd75b3589 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py @@ -22,6 +22,7 @@ from ..utils import compute_compatible_dim_mapping from ..utils import compute_compatible_dims_mapping from ..utils import compute_compatible_and_update_dim_mapping +from ..utils import set_dist_op_desc_original_id from paddle.fluid import core, unique_name from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.framework import Program, Parameter, Variable, program_guard @@ -43,7 +44,7 @@ def __init__(self, name): super(DistributedReshapeImpl0, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = True 
+ self._backward_implemented = False def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc @@ -181,6 +182,7 @@ def forward(ctx, *args, **kwargs): # create op new_op_desc = main_block.desc.append_op() new_op_desc.copy_from(src_op.desc) + set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) new_op_desc.set_input('Shape', Shape_var_list) new_op_desc.set_input('X', [X_var.name]) @@ -200,7 +202,7 @@ def __init__(self, name): super(DistributedReshapeImpl1, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = True + self._backward_implemented = False def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc @@ -345,6 +347,7 @@ def forward(ctx, *args, **kwargs): # create op new_op_desc = main_block.desc.append_op() new_op_desc.copy_from(src_op.desc) + set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) new_op_desc.set_input('Shape', Shape_var_list) new_op_desc.set_input('X', [X_var.name]) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py index de2d0ba62e62e..e4624b51222ed 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py @@ -39,7 +39,7 @@ def __init__(self, name): super(DistributedSoftmaxImpl, self).__init__() self._name = name self._forward_implemented = False - self._backward_implemented = True + self._backward_implemented = False def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc diff --git a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py index 98c468105180f..8b40524e47315 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py @@ -39,7 +39,7 @@ def __init__(self, name): super(DistributedTranspose2Impl, self).__init__() self._name = name self._forward_implemented = False - self._backward_implemented = True + self._backward_implemented = False def is_input_compatible(self, dist_op): return True diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index f6ddf2b9b7350..7cad4d746bbf2 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -22,15 +22,17 @@ import logging import pickle import time - import paddle +from paddle.fluid.backward import append_backward from paddle.distributed.utils import get_logger from paddle.distributed.fleet import cloud_utils import paddle.fluid.core as core +from paddle.fluid import program_guard +from paddle.distributed.passes import new_pass, PassContext from .dist_context import DistributedContext from .dist_context import get_default_distributed_context from .dist_context import set_default_distributed_context -from .completion import complete_annotation, complete_backward_annotation +from .completion import complete_annotation, complete_backward_annotation, complete_update_annotation from .partitioner import Partitioner from .process_group import get_all_process_groups from .process_group import get_process_group @@ -38,6 +40,7 @@ from .process_group import _g_process_group_map, ProcessGroup from 
.utils import make_data_unshard from .utils import set_grad_var_shape +from .utils import print_program_with_dist_attr from .utils import SerialProgramInfo from .reshard import reshard, HAS_SENT, HAS_RECV, HAS_ALLGATHER from .cluster import Cluster @@ -45,6 +48,7 @@ from .dist_op import DistributedOperator from .dist_tensor import DistributedTensor from .planner import Planner +from paddle.distributed.passes import new_pass, PassContext _logger = get_logger(logging.INFO) @@ -76,9 +80,12 @@ def __init__(self, fleet): self._enable_auto_mapping = False else: self._enable_auto_mapping = True + self._pass_context = PassContext() + self._need_rank_mapping = os.getenv("PADDLE_NEED_RANK_MAPPING") self._need_rank_mapping = True if self._need_rank_mapping and \ self._need_rank_mapping.lower() == 'true' else False + self._pass_context = None def _remove_distributed_attrs(self, main_program): suffix = core.kAutoParallelSuffix() @@ -90,28 +97,106 @@ def _remove_distributed_attrs(self, main_program): if suffix in attr_name: op._remove_attr(attr_name) + def _apply_serial_pass(self, main_program, startup_program): + + # apply amp pass + if self._dist_strategy.amp: + auto_parallel_amp_pass = new_pass("auto_parallel_amp_pass", + self._dist_strategy.amp_configs) + auto_parallel_amp_pass.apply(main_program, startup_program, + self._pass_context) + + # apply recompute pass + if self._dist_strategy.recompute: + auto_parallel_recompute_pass = new_pass( + "auto_parallel_recompute_pass", + self._dist_strategy.recompute_configs) + auto_parallel_recompute_pass.apply(main_program, startup_program, + self._pass_context) + + def _generate_backward(self, main_program, startup_program, loss, + parameter_list, no_grad_set, callbacks): + + with program_guard(main_program, startup_program): + params_grads = append_backward( + loss, + parameter_list, + no_grad_set, + callbacks, + distop_context=self._dist_context.dist_op_context) + complete_backward_annotation( + main_program, dist_context=self._dist_context) + + return params_grads + + def _apply_optimize(self, main_program, startup_program, params_grads): + + with program_guard(main_program, startup_program): + optimize_ops = copy.deepcopy(self._optimizer).apply_gradients( + params_grads) + + # update completion + complete_update_annotation( + main_program, dist_context=self._dist_context) + + return optimize_ops + + def _apply_post_optimization_passed(self, main_program, startup_program, + rank, params_grads): + + if self._dist_strategy.sharding: + config = copy.deepcopy(self._dist_strategy.sharding_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + config["global_rank"] = rank + auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", + config) + auto_parallel_sharding_pass.apply( + [main_program], [startup_program], self._pass_context) + + if self._dist_strategy.gradient_merge: + config = copy.deepcopy(self._dist_strategy.gradient_merge_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + auto_parallel_gradient_merge_pass = new_pass( + "auto_parallel_gradient_merge_pass", config) + auto_parallel_gradient_merge_pass.apply( + [main_program], [startup_program], self._pass_context) + def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): completed_main_program = None + serial_main_program = self._main_program.clone() + serial_startup_program = self._startup_program.clone() + serial_loss = serial_main_program.global_block().var(self._loss.name) + # 
generating serial if dist_context is None: # Annotation completion self._dist_context = DistributedContext() _logger.info("Start annotation dist attr.") - completed_main_program = complete_annotation(self._main_program, + completed_main_program = complete_annotation(serial_main_program, self._dist_context) else: - completed_main_program = self._main_program + completed_main_program = serial_main_program self._dist_context = copy.deepcopy(dist_context) - # Logical partition - partitioner = Partitioner(self._dist_strategy, self._dist_context, rank) - dist_main_prog, dist_startup_prog = partitioner.transpile_forward( - completed_main_program, self._startup_program) - dist_params_grads = partitioner.apply_backward( - self._loss, completed_main_program, self._startup_program, - dist_main_prog, dist_startup_prog) - dist_optimize_ops = partitioner.apply_optimize( - copy.deepcopy(self._optimizer), dist_params_grads, dist_main_prog, - dist_startup_prog) + # serial backward pass + params_grads = self._generate_backward( + completed_main_program, serial_startup_program, serial_loss, + self._parameter_list, self._no_grad_set, self._callbacks) + + # serial forward pass + self._apply_serial_pass(completed_main_program, serial_startup_program) + + # Logical partition + rank = paddle.distributed.get_rank() + partitioner = Partitioner(self._dist_context, rank) + dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( + completed_main_program, serial_startup_program, params_grads) + + # TODO refactor the placement of optimizer + # generate optimize program + dist_optimize_ops = self._apply_optimize( + dist_main_prog, dist_startup_prog, dist_params_grads) set_grad_var_shape(dist_main_prog, self._dist_context) @@ -119,6 +204,8 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): reshard(dist_main_prog, dist_startup_prog, rank, self._dist_context) + self._apply_post_optimization_passed(dist_main_prog, dist_startup_prog, + rank, dist_params_grads) g_process_group_map = None if not relaunch_phase: g_process_group_map = copy.deepcopy(_g_process_group_map) @@ -133,13 +220,15 @@ def parallelize(self, loss, startup_program, parameter_list=None, - no_grad_set=None): + no_grad_set=None, + callbacks=None): assert startup_program is not None self._loss = loss self._startup_program = startup_program self._main_program = loss.block.program self._parameter_list = parameter_list self._no_grad_set = no_grad_set + self._callbacks = callbacks if self._enable_auto_mapping and self._need_rank_mapping: # Do the mapping pass before parallelization @@ -156,6 +245,7 @@ def parallelize(self, self._optimizer, self._cluster) planner = Planner( serial_program_info, + self, algorithm_config={"name": "mcmc", "max_search_times": 5}) dist_context, _ = planner.search() @@ -262,6 +352,7 @@ def parallelize(self, cluster=self._cluster) planner = Planner( serial_program_info, + self, algorithm_config={ "name": "mcmc", "max_search_times": 5 @@ -303,3 +394,14 @@ def parallelize(self, self._remove_distributed_attrs(dist_main_prog) return dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog + + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + if k == "_main_program" or k == "_startup_program" or k == "_dist_context" or k == "_fleet" or k == "_loss": + setattr(result, k, v) + else: + setattr(result, k, copy.deepcopy(v, memo)) + return result diff --git 
a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py old mode 100755 new mode 100644 index 9af194e810fb6..76a9faa1c8398 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -20,18 +20,13 @@ from paddle.fluid import framework as framework from paddle.fluid import core, unique_name from paddle.fluid.framework import Program, Parameter, Variable, program_guard -from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype -from paddle.fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_ from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container -from paddle.fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops, ClipGradByGlobalNorm -from paddle.distributed.fleet.base.distributed_strategy import DistributedStrategy from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext -from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op, is_optimizer_op -from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY from .dist_attribute import OperatorDistributedAttribute from .process_group import new_process_group -from .utils import print_program_with_dist_attr -from paddle.distributed.auto_parallel.completion import complete_backward_annotation, complete_update_annotation +from .utils import set_dist_op_desc_original_id +from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_recompute_op +from .operators.common import BACKWARD_ONLY_DIST_OPS __varname_not_in_block__ = ["lod_tensor_blocking_queue_0"] @@ -48,331 +43,147 @@ class Partitioner(object): 2. partition var: if a var is sharded, modify the shape of var according to its shard annotation Partitioner is supposed to be call by the auto parallel framework, and not supposed to be directly called by user. - - Example: - .... 
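# NOTE: the docstring example removed here exercised the old transpile_forward /
# apply_backward / apply_optimize flow.  A hedged sketch of the single-call API
# this diff introduces instead; it assumes an already annotated serial program,
# its params_grads from append_backward, and a populated DistributedContext,
# and is illustrative rather than tested.
import paddle
from paddle.distributed.auto_parallel.partitioner import Partitioner

def build_rank_local_programs(dist_context, serial_main_prog,
                              serial_startup_prog, params_grads):
    rank_id = paddle.distributed.get_rank()
    partitioner = Partitioner(dist_context, rank_id)
    # one call returns the rank-local main/startup programs plus the renamed
    # (param, grad) pairs; optimizer ops are appended later by the parallelizer
    return partitioner.partition(serial_main_prog, serial_startup_prog,
                                 params_grads)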
- import paddle.distributed.auto_parallel as auto - from paddle.fluid.distributed_attribute import get_default_distributed_context - from paddle.distributed import fleet - from paddle.distributed.auto_parallel.partitioner import Partitioner - - # create serial program with forward only - with static.program_guard(serial_main_program, serial_start_program): - model = create_model(config) - tokens = static.data(name="tokens", shape=[batch_size, sequence_len], dtype='int64') - labels = static.data(name="labels", shape=[batch_size, sequence_len], dtype='int64') - loss_mask = static.data(name="loss_mask", shape=[batch_size, sequence_len], dtype='int64') - preds = model(tokens) - loss = criterion(preds, labels, loss_mask) - - # auto completion - auto.ProcessMesh(shape=[2, 4], process_group=[0, 1, 2, 3, 4, 5, 6, 7]) - annotated_main_program = auto.complete_annotation(serial_main_program) - dist_context = get_default_distributed_context() - - # distributed strategy & rank info - rank_id = paddle.distributed.get_rank() - dist_strategy = fleet.DistributedStrategy() - - # create partitioner - Partitioner = Partitioner(dist_strategy, dist_context, rank_id) - - # create dist program with forward only - # for distributed inference, using partitioned_main_prog from here - partitioned_main_prog, partitioned_startup_prog = Partitioner.transpile_forward(complete_train_program, start_program) - - # create dist program with forward/backward/update - # for distributed training, using partitioned_main_prog from here - dist_params_grads = Partitioner.apply_backward(loss, complete_train_program, start_program, partitioned_main_prog, partitioned_startup_prog) - optimizer = paddle.fluid.optimizer.AdamOptimizer( - learning_rate=0.00001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - grad_clip=None) - opt_ops = Partitioner.apply_optimize(optimizer, dist_params_grads, partitioned_main_prog, partitioned_startup_prog) """ - def __init__(self, dist_strategy, dist_context, rank_id=0): + def __init__(self, dist_context, rank_id=0): """ Args: - dist_strategy (paddle.fleet.distributed_strategy): used to determine the user defined distributed strategy. dist_context (paddle.fluid.DistributedContext): used to access the distributed_attr of var & op, every Partitioner object could maintain its own DistributedContext member, and partition program base on that shard scenario. rank_id (int): global rank id to which the partitioned distributed program belong. """ - - if not isinstance(dist_strategy, DistributedStrategy): - raise TypeError( - "dist_strategy be paddle.fleet.base.DistributedStrategy, got %s here" - % type(dist_strategy)) - if not isinstance(dist_context, DistributedContext): raise TypeError( "dist_context be paddle.fluid.DistributedContext, got %s here" % type(dist_context)) - self._dist_strategy = dist_strategy self._dist_context = dist_context self._rank_id = rank_id self._serial2dist_varname_mapping = {} self._dist_varname_suffix = "" - # TODO if there is some dist op that is not compatible - # with auto_backward in forward, the following flag - # should be set to False - self._compatible_with_auto_backward = True - - def transpile_forward(self, serial_main_program, serial_startup_program): - """ - take serial forward programs with shard annotation, create a new distributed forward programs based on the serial ones. - instead of modify the input programs inplace, this function will preserve the inputs and create new program for output. 
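# NOTE: partition() / partition_startup_program introduced below shrink every
# tensor to its rank-local shape via _get_dist_shape, whose body is outside
# this hunk.  A hedged sketch of that computation (an assumption, not copied
# from the diff): dimensions mapped to a mesh axis are divided by that axis'
# size, replicated (-1) dimensions are kept as-is.
def dist_shape(serial_shape, dims_mapping, mesh_topology):
    local_shape = []
    for size, mapping in zip(serial_shape, dims_mapping):
        if mapping == -1:
            local_shape.append(size)          # replicated on every rank
        else:
            assert size % mesh_topology[mapping] == 0
            local_shape.append(size // mesh_topology[mapping])
    return local_shape

# e.g. a [8192, 1024] weight with dims_mapping [-1, 0] on a [2, 4] mesh keeps
# dim 0 and halves dim 1, giving a per-rank shape of [8192, 512].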
- - beside replace the serial op with its dist op, if user has defined other strategy in fleet.distributed_strategy, and if - those strategy need to transpile (modify) the forward network program, those forward program modification should also be done within this - function in auto parallel scenario, in order to facilitate distributed inference/evaluation which need to DECOUPLE strategy specific forward transpilation with fleet.distributed_optimizer.minimize(). - - by now the fleet.distributed_strategy that need transpile forward program are following: - 1. (optimizer) sharding - - Args: - main_program (paddle.fluid.framework.program): serial main program with forward network only - startup_program (paddle.fluid.framework.program): serial startup program with forward network only - - return: - main_program (paddle.fluid.framework.program): distributed main program with forward network only - startup_program (paddle.fluid.framework.program): distributed startup program with forward network only - """ - - dist_main_program, dist_startup_program = self.transpile_forward_impl( - serial_main_program, serial_startup_program) - return dist_main_program, dist_startup_program - - def apply_backward(self, - serial_loss, - serial_main_program, - serial_startup_program, - dist_main_program, - dist_startup_program, - parameter_list=None, - no_grad_set=None, - callbacks=None): - """ - A complete training neural network is made up of forward and backward propagation. - This function is to generate the dist backward program for the distributed forward program. - - By now, the current automatical backward mechanism in paddle framework might NOT handle the backward generation for - some dist ops correctly, some so we now have two ways to genenate the backward program: - 1. dist_forward_program --> auto_backward --> dist_backward_program (if auto_backward could handle all dist op) - 2. serial_forward_program --> auto_backward --> serial_backward_program --> dist_op_backward_transpile --> dist_backward_program (if auto_backward could not handle all dist op) - - the backprogram is append the input dist program inplaced. - - Args: - serial_loss (Variable) the loss in serial program that to be minimized - serial_main_program (paddle.fluid.framework.program): serial main program with forward network only - serial_startup_program (paddle.fluid.framework.program): serial startup program with forward network only - dist_main_program (paddle.fluid.framework.program): dist main program with forward network only - dist_startup_program (paddle.fluid.framework.program): dist startup program with forward network only - parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update - to minimize ``loss``. The default value is None, at this time all parameters - will be updated. - no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need - to be updated. The default value is None. - callbacks (list, optional): list of callable objects to run when appending backward - operator for one parameter. The default value is None. 
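# NOTE: the strategy-specific forward transpilation promised by the removed
# docstrings here (e.g. sharding) no longer lives in the partitioner; it is
# packaged as distributed passes and applied by the parallelizer after
# optimization.  A hedged condensation of _apply_post_optimization_passed from
# this diff (pass names and config keys mirror that method; nothing here is
# new API):
import copy
from paddle.distributed.passes import new_pass, PassContext

def apply_post_opt_passes(strategy, dist_context, params_grads, rank,
                          main_program, startup_program):
    pass_context = PassContext()
    if strategy.sharding:
        config = copy.deepcopy(strategy.sharding_configs)
        config["dist_context"] = dist_context
        config["params_grads"] = params_grads
        config["global_rank"] = rank
        new_pass("auto_parallel_sharding", config).apply(
            [main_program], [startup_program], pass_context)
    if strategy.gradient_merge:
        config = copy.deepcopy(strategy.gradient_merge_configs)
        config["dist_context"] = dist_context
        config["params_grads"] = params_grads
        new_pass("auto_parallel_gradient_merge_pass", config).apply(
            [main_program], [startup_program], pass_context)
    return pass_context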
- - return: - params_grads (list) list of tuple that contain param and its grad variable - """ - params_grads = self.apply_backward_impl( - serial_loss, serial_main_program, serial_startup_program, - dist_main_program, dist_startup_program) - return params_grads - - def apply_optimize(self, user_define_optimizer, params_grads, - dist_main_program, dist_startup_program): - """ - append update related ops to the program: clip, weight decay, ops - filter optimize op if sharding is enable - naive gradient synchronization before update - - Args: - user_define_optimizer (paddle.fluid.optimizer): - params_grads (list) list of tuple that contain param and its grad variable - dist_main_program (paddle.fluid.framework.program): dist main program with forward & backward network - dist_startup_program (paddle.fluid.framework.program): dist startup program with forward & backward network - """ - - optimize_ops = self.apply_optimize_impl(user_define_optimizer, - params_grads, dist_main_program, - dist_startup_program) + def partition(self, serial_main_program, serial_startup_program, + params_grads): - return optimize_ops - - def transpile_forward_impl(self, main_program, startup_program): - - if not isinstance(main_program, (Program)): + if not isinstance(serial_main_program, (Program)): raise TypeError( - "dist_strategy be paddle.fluid.framework.program, got %s here" % - type(main_program)) - - if not isinstance(startup_program, (Program)): - raise TypeError( - "dist_context be paddle.fluid.framework.program, got %s here" % - type(startup_program)) + "main_program be paddle.fluid.framework.program, got %s here" % + type(serial_main_program)) # check if shard annotated serial program valid - if not self._is_valid_annotated_program(main_program): + if not self._is_valid_annotated_program(serial_main_program): raise RuntimeError( "Not all vars or ops are annotated in main program !") - # dist op & partition vars - new_main_prog, new_startup_program = self._dist_var_op_forward_transpile( - main_program, startup_program) - - # Sharding - if self._dist_strategy.sharding: - new_main_prog, new_startup_program = self._sharding_forward_transpile( - new_main_prog, new_startup_program) - - return new_main_prog, new_startup_program - - def apply_backward_impl(self, - serial_loss, - serial_main_program, - serial_startup_program, - dist_main_program, - dist_startup_program, - parameter_list=None, - no_grad_set=None, - callbacks=None): - """ - """ - - params_grads = self._dist_var_op_backward_transpile( - serial_loss, serial_main_program, serial_startup_program, - dist_main_program, dist_startup_program) - # Sharding - if self._dist_strategy.sharding: - self._sharding_backward_transpile(new_main_prog, - new_startup_program) - - return params_grads - - def apply_optimize_impl(self, user_define_optimizer, params_grads, - dist_main_program, dist_startup_program): - """ - append update related ops to the program: clip, weight decay, ops - filter optimize op if sharding is enable - naive gradient synchronization before update + # init distop helper + dist_op_context = self._dist_context.dist_op_context + dist_op_context.set_varname_mapping(self._serial2dist_varname_mapping) + dist_op_context.set_rank_id(self._rank_id) - Args: - user_define_optimizer (paddle.fluid.optimizer): - params_grads (list) list of tuple that contain param and its grad variable - dist_main_program (paddle.fluid.framework.program): dist main program with forward & backward network - dist_startup_program (paddle.fluid.framework.program): dist startup 
program with forward & backward network - """ + # partition startup program + if serial_startup_program == None: + partitioned_startup_prog = None + else: + partitioned_startup_prog = self.partition_startup_program( + serial_main_program, serial_startup_program) + dist_op_context.set_dst_startup_program(partitioned_startup_prog) - if self._dist_strategy.sharding: - params_grads = sharding_optimize_transpile( - params_grads, dist_main_program, dist_startup_program) + # partition main program + partitioned_main_prog, partitioned_params_grads = self.partition_main_program( + serial_main_program, params_grads) - optimize_ops = self._optimize_transpile(user_define_optimizer, - params_grads, dist_main_program, - dist_startup_program) + return partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads - return optimize_ops + def partition_startup_program(self, serial_main_program, + serial_startup_program): - def _dist_var_op_forward_transpile(self, - serial_main_program, - serial_startup_program=None): + if not isinstance(serial_startup_program, (Program)): + raise TypeError( + "dist_context be paddle.fluid.framework.program, got %s here" % + type(serial_startup_program)) + + partitioned_startup_prog = fluid.Program() + ref_block = serial_main_program.global_block() + target_block = partitioned_startup_prog.global_block() + var2shape = {} + temp_varname_map = {} + + # tensors + for var in serial_startup_program.list_vars(): + assert var.persistable + new_name = var.name + self._dist_varname_suffix + temp_varname_map[var.name] = new_name + target_shape = _partition_var(self._dist_context, ref_block, + target_block, var.name, new_name) + var2shape[new_name] = target_shape + + # ops + for op in serial_startup_program.global_block().ops: + # TODO if var not belong to this rank, should be filtered + output_vars = op.desc.output_arg_names() + assert len( + output_vars + ) == 1, "initializer should output only ONE variable, but got [{}]".format( + str(op.desc)) + assert temp_varname_map[output_vars[ + 0]] in var2shape, "try to initialize [{}] which is not a persistable var".format( + output_vars[0]) + new_op_desc = target_block.desc.append_op() + new_op_desc.copy_from(op.desc) + new_op_desc._rename_output(output_vars[0], + temp_varname_map[output_vars[0]]) + new_op_desc._set_attr("shape", + var2shape[temp_varname_map[output_vars[0]]]) + target_block._sync_with_cpp() + + # set distribute atrribute + new_op = target_block.ops[-1] + assert new_op.type == new_op_desc.type() + assert new_op.desc == new_op_desc + output_var = target_block.var(output_vars[0]) + output_var_attr = self._dist_context.get_tensor_dist_attr_for_program( + output_var) + op_attr = OperatorDistributedAttribute() + op_attr.process_mesh = output_var_attr.process_mesh + op_attr.set_output_dims_mapping(output_var.name, + output_var_attr.dims_mapping) + op_attr.set_input_dims_mapping(output_var.name, + output_var_attr.dims_mapping) + self._dist_context.set_op_dist_attr_for_program(new_op, op_attr) + + return partitioned_startup_prog + + def partition_main_program(self, serial_main_program, params_and_grads): """ 1. partition variables 2. 
replace local op with corresponding dist op """ + dist_op_context = self._dist_context.dist_op_context partitioned_main_prog = fluid.Program() - partitioned_global_block = partitioned_main_prog.global_block() - serial_main_block = serial_main_program.global_block() + dist_op_context.set_dst_main_program(partitioned_main_prog) + target_block = partitioned_main_prog.global_block() + ref_block = serial_main_program.global_block() serial_ops = serial_main_program.global_block().ops - # transpile startup program - if serial_startup_program == None: - partitioned_startup_prog = None - else: - partitioned_startup_prog = fluid.Program() - # create parameter - partitioned_startup_global_block = partitioned_startup_prog.global_block( - ) - param2shape = {} - temp_varname_map = {} - for var in serial_startup_program.list_vars(): - if isinstance(var, Parameter): - # TODO if var not belong to this rank, should be filtered - serial_main_var = serial_main_block.var(var.name) - dist_attr = self._dist_context.get_tensor_dist_attr_for_program( - serial_main_var) - target_shape = _get_dist_shape(serial_main_var, dist_attr) - new_name = var.name + self._dist_varname_suffix - temp_varname_map[var.name] = new_name - _partition_parameter(self._dist_context, serial_main_var, - partitioned_startup_global_block, - new_name, target_shape) - param2shape[new_name] = target_shape - - # copy initializer - for op in serial_startup_program.global_block().ops: - # TODO if var not belong to this rank, should be filtered - output_vars = op.desc.output_arg_names() - assert len( - output_vars - ) == 1, "initializer should output only ONE variable, but got [{}]".format( - str(op.desc)) - assert temp_varname_map[output_vars[ - 0]] in param2shape, "try to initialize [{}] which is not a Parameter".format( - output_vars[0]) - new_op_desc = partitioned_startup_global_block.desc.append_op() - new_op_desc.copy_from(op.desc) - new_op_desc._rename_output(output_vars[0], - temp_varname_map[output_vars[0]]) - new_op_desc._set_attr( - "shape", param2shape[temp_varname_map[output_vars[0]]]) - partitioned_startup_global_block._sync_with_cpp() - - # set distribute atrribute - new_op = partitioned_startup_global_block.ops[-1] - assert new_op.type == new_op_desc.type() - assert new_op.desc == new_op_desc - output_var = partitioned_startup_global_block.var(output_vars[ - 0]) - output_var_attr = self._dist_context.get_tensor_dist_attr_for_program( - output_var) - op_attr = OperatorDistributedAttribute() - op_attr.process_mesh = output_var_attr.process_mesh - op_attr.set_output_dims_mapping(output_var.name, - output_var_attr.dims_mapping) - op_attr.set_input_dims_mapping(output_var.name, - output_var_attr.dims_mapping) - self._dist_context.set_op_dist_attr_for_program(new_op, op_attr) - - # TODO move helper init to a comm place - dist_op_context = self._dist_context.dist_op_context - dist_op_context.set_dst_main_program(partitioned_main_prog) - dist_op_context.set_dst_startup_program(partitioned_startup_prog) - dist_op_context.set_varname_mapping(self._serial2dist_varname_mapping) - dist_op_context.set_rank_id(self._rank_id) + # init mapping + first_backward_op_idx = -1 + forward_op_id2forward_op = {} + for idx in range(len(serial_ops)): + if is_forward_op(serial_ops[idx]): + forward_op_id2forward_op[serial_ops[idx].desc.id( + )] = serial_ops[idx] - # transpile main program + # partiiton for op in serial_ops: # partititon input variables for serial_input_varname in op.desc.input_arg_names(): if serial_input_varname not in 
self._serial2dist_varname_mapping: new_varname = serial_input_varname + self._dist_varname_suffix - if serial_main_block.has_var(serial_input_varname): - _partition_var(self._dist_context, serial_main_block, - partitioned_global_block, - serial_input_varname, new_varname) + if ref_block.has_var(serial_input_varname): + _partition_var(self._dist_context, ref_block, + target_block, serial_input_varname, + new_varname) else: assert serial_input_varname in __varname_not_in_block__ @@ -383,145 +194,46 @@ def _dist_var_op_forward_transpile(self, for serial_output_varname in op.desc.output_arg_names(): if serial_output_varname not in self._serial2dist_varname_mapping: new_varname = serial_output_varname + self._dist_varname_suffix - _partition_var(self._dist_context, serial_main_block, - partitioned_global_block, + _partition_var(self._dist_context, ref_block, target_block, serial_output_varname, new_varname) self._serial2dist_varname_mapping[ serial_output_varname] = new_varname # partition op - kinputs, koutputs = dist_op_context.prepare_forward_context(op) - dist_attr = self._dist_context.get_op_dist_attr_for_program(op) - if _is_dist_op_forward_implement(self._dist_context, op): - dist_ops = get_distributed_operator_impl_container(op.type) - dist_op_impl = dist_ops.get_impl(dist_attr.impl_idx) - dist_op_impl.forward(self._dist_context, **kinputs, **koutputs) - + if is_forward_op(op): + kinputs, koutputs = dist_op_context.prepare_context(op) + dist_op_forward_impl = _get_dist_op_forward_implement( + op, self._dist_context) + dist_op_forward_impl.forward(self._dist_context, **kinputs, + **koutputs) + + elif is_backward_op(op): + kinputs, koutputs = dist_op_context.prepare_context(op) + dist_op_backward_impl = _get_dist_op_backward_implement( + op, self._dist_context, forward_op_id2forward_op) + dist_op_backward_impl.backward(self._dist_context, **kinputs, + **koutputs) else: - # replicate op - dist_ops = get_distributed_operator_impl_container("default") - dist_op_impl = dist_ops.get_impl(0) - dist_op_impl.forward(self._dist_context, **kinputs, **koutputs) - - return partitioned_main_prog, partitioned_startup_prog - - def _dist_var_op_backward_transpile(self, - serial_loss, - serial_main_program, - serial_startup_program, - dist_main_program, - dist_startup_program, - parameter_list=None, - no_grad_set=None, - callbacks=None): - """ - so far, the auto_backward case only guarantee the correcotness of backward ops for curtain Dist ops: - 1. NV-Megatron-like parallel embedding - 2. NV-Megatron-like row parallel linear - 3. NV-Megatron-like col parallel linear - """ - - if self._compatible_with_auto_backward: - assert isinstance( - serial_loss, Variable), "The target loss should be an Variable." - dist_loss = self._serial_varname2dist_var(serial_loss.name, - dist_main_program) - - assert len(dist_loss.shape) == 1 and dist_loss.shape[0] == 1, \ - "The dist loss.shape should be (1L,), but the current dist loss.shape is {}. 
" \ - "Maybe that you should call fluid.layers.mean to process the current loss.".format( - dist_loss.shape) - - # update parameter list - if parameter_list: - parameter_list = [ - self._serial_varname2dist_var(param.name, dist_main_program) - for param in parameter_list - ] - - # update parameter no_grad_set - if no_grad_set: - no_grad_set = [ - self._serial_varname2dist_var(param.name, dist_main_program) - for param in no_grad_set - ] - - dist_op_context = self._dist_context.dist_op_context - params_and_grads = _auto_backward( - dist_loss, - dist_startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set, - callbacks=callbacks, - distop_context=dist_op_context) - - # backward completion - complete_backward_annotation( - dist_main_program, dist_context=self._dist_context) - - # transpiler backward for dist op - # get backward ops - ops = dist_main_program.global_block().ops - first_backward_op_idx = -1 - forward_op_id2forward_op = {} - for idx in range(len(ops)): - if is_forward_op(ops[idx]): - forward_op_id2forward_op[ops[idx].desc.id()] = ops[idx] - - if int(ops[idx].attr('op_role')) == int(OpRole.Backward): - first_backward_op_idx = idx - break - assert first_backward_op_idx >= 0, "not found backward ops in program" - assert len(forward_op_id2forward_op - ) > 0, "not found forward ops in program" - - backward_ops = ops[first_backward_op_idx:] - for backward_op in backward_ops: - # if the backward op has a corresponding forward op - if backward_op.desc.id() in dist_op_context.gradopidx2opidx: - forward_op_id = dist_op_context.gradopidx2opidx[ - backward_op.desc.id()] - forward_op = forward_op_id2forward_op[forward_op_id] - # TODO backward attr should has _impl_idx - forward_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( - forward_op) - # TODO use the backward op itself to find the dist op - dist_ops = get_distributed_operator_impl_container( - forward_op.type) - kinputs, koutputs = dist_op_context.prepare_backward_context( - backward_op) - - # TODO use backward op itself to determine impl idx - if _is_dist_op_backward_implement(self._dist_context, - forward_op): - dist_op_impl = dist_ops.get_impl( - forward_op_dist_attr.impl_idx) - dist_op_impl.backward(self._dist_context, **kinputs, - **koutputs) - else: - # replicate op - dist_ops = get_distributed_operator_impl_container( - "default") - dist_op_impl = dist_ops.get_impl(0) - dist_op_impl.backward(self._dist_context, **kinputs, - **koutputs) - - return params_and_grads - # replace dist grad ops - else: - raise RuntimeError("transpile NOT implemented !") - - def _optimize_transpile(self, user_define_optimizer, params_grads, - main_program, startup_program): - - with program_guard(main_program, startup_program): - optimize_ops = user_define_optimizer.apply_gradients(params_grads) - - # update completion - complete_update_annotation( - main_program, dist_context=self._dist_context) + raise NotImplementedError( + "partitioner only support forward op and backward op, but got {}". 
+ format(str(op))) + + partitioned_params_and_grads = [] + for p, g in params_and_grads: + assert p.name in self._serial2dist_varname_mapping + dist_p_name = self._serial2dist_varname_mapping[p.name] + assert target_block.has_var(dist_p_name) + dist_p = target_block.var(dist_p_name) + if g is None: + dist_g = None + else: + assert g.name in self._serial2dist_varname_mapping + dist_g_name = self._serial2dist_varname_mapping[g.name] + assert target_block.has_var(dist_g_name) + dist_g = target_block.var(dist_g_name) + partitioned_params_and_grads.append((dist_p, dist_g)) - return optimize_ops + return partitioned_main_prog, partitioned_params_and_grads def _is_valid_annotated_program(self, program): @@ -543,154 +255,6 @@ def _is_valid_annotated_program(self, program): return all_ops_annotated and all_vars_annotated - def _serial_varname2dist_var(self, serial_varname, dist_program): - assert serial_varname in self._serial2dist_varname_mapping, "The serial var [{}] is not found in var name mapping".format( - serial_varname) - dist_varname = self._serial2dist_varname_mapping[serial_varname] - - assert dist_program.global_block().has_var( - dist_varname - ), "The dist var [{}] is not found in dist program".format(dist_varname) - dist_var = dist_program.global_block().var(dist_varname) - - return dist_var - - def _is_var_distributed(self, var): - - dist_attr = self._dist_context.get_tensor_dist_attr_for_program(var) - assert dist_attr is not None, "dist_attr of var [{}] is None".format( - var.name) - return _is_distributed(dist_attr) - - def _sharding_forward_transpile(self, main_prog, startup_program): - """ - this transpile conduct the modification in forward program need by sharding strategy - which majorly include: - 1. partition the parameter - 2. insert broadcast op - 3. insert sync op - - NOTE the transpile modification is inplace on the input program - """ - - raise NotImplementedError( - "Sharding is NOT support in AutoParallel yet!") - - def _sharding_backward_transpile(self, main_prog, startup_program): - """ - this transpile conduct the modification in backward program need by sharding strategy - which majorly include: - 1. partition the gradient - 2. insert broadcast op - 3. insert sync op - - NOTE the transpile modification is inplace on the input program - """ - - raise NotImplementedError( - "Sharding is NOT support in AutoParallel yet!") - - def _sharding_optimize_transpile(self, params_grads, dist_main_program, - dist_startup_program): - """ - shard params_grads - append the broadcast to sync parameters - """ - raise RuntimeError("sharding transpile is NOT implemented !") - - -def _get_no_grad_set_name(no_grad_set): - no_grad_set_name = set() - if no_grad_set is not None: - if isinstance(no_grad_set, (set, list, tuple)): - for i, no_grad_var in enumerate(no_grad_set): - if isinstance(no_grad_var, framework.Variable): - no_grad_set_name.add(no_grad_var.name) - elif isinstance(no_grad_var, six.string_types): - no_grad_set_name.add(no_grad_var) - else: - raise TypeError( - "The type of no_grad_set's member must be paddle.fluid.Variable or str, but received %s." - % (type(no_grad_var))) - else: - raise TypeError( - "The type of no_grad_set should be set or list or tuple, but received {}". 
- format(type(no_grad_set))) - return no_grad_set_name - - -def _get_no_grad_set(loss, no_grad_set=None): - no_grad_set = _get_no_grad_set_name(no_grad_set) - parameters = loss.block.program.global_block().all_parameters() - param_no_trainable = set( - [param.name for param in parameters if param.trainable is False]) - # If the parameter is no trainable, it should not have a gradient. - no_grad_set.update(param_no_trainable) - - return no_grad_set - - -def _is_dist_op_forward_implement(dist_context, op): - dist_attr = dist_context.get_op_dist_attr_for_program(op) - dist_ops = get_distributed_operator_impl_container(op.type) - - return dist_ops and dist_attr.impl_idx >= 0 and dist_ops.get_impl( \ - dist_attr.impl_idx)._forward_implemented - - -def _is_dist_op_backward_implement(dist_context, op): - dist_attr = dist_context.get_op_dist_attr_for_program(op) - dist_ops = get_distributed_operator_impl_container(op.type) - - return dist_ops and dist_attr.impl_idx >= 0 and dist_ops.get_impl( \ - dist_attr.impl_idx)._backward_implemented - - -def _auto_backward(loss, - startup_program=None, - parameter_list=None, - no_grad_set=None, - callbacks=None, - distop_context=None): - """ - modification is inplaced - """ - act_no_grad_set = _get_no_grad_set(loss, no_grad_set) - assert isinstance(loss, Variable), "The target loss should be an Variable." - - if callbacks is None: - callbacks = [error_clip_callback] - else: - assert (isinstance(callbacks, list)) - - assert len(loss.shape) == 1 and loss.shape[0] == 1, \ - "The loss.shape should be (1L,), but the current loss.shape is {}. " \ - "Maybe that you should call fluid.layers.mean to process the current loss.".format( - loss.shape) - - program = loss.block.program - - with program_guard(program, startup_program): - params_grads = append_backward( - loss, - parameter_list, - act_no_grad_set, - callbacks, - distop_context=distop_context) - - return params_grads - - -def _is_distributed(dist_attr): - - mapping = dist_attr.dims_mapping - mesh = dist_attr.process_mesh.topology - for idx in range(len(mapping)): - if mapping[idx] >= 0 and mesh[mapping[idx]] > 1: - return True - - return False - def _get_dist_shape(var, dist_attr): @@ -783,6 +347,7 @@ def _partition_var(dist_context, src_block, dst_block, src_varname, name=dst_varname, persistable=True, stop_gradient=True) + target_shape = None else: dist_attr = dist_context.get_tensor_dist_attr_for_program(src_var) target_shape = _get_dist_shape(src_var, dist_attr) @@ -793,54 +358,44 @@ def _partition_var(dist_context, src_block, dst_block, src_varname, else: _partition_intermediate_var(dist_context, src_var, dst_block, dst_varname, target_shape) + return target_shape + +def _get_dist_op_backward_implement(backward_op, dist_context, + forward_op_id2forward_op): + dist_op_context = dist_context.dist_op_context + if backward_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + forward_op_id = dist_op_context.grad_op_id_to_op_id[backward_op.desc.id( + )] + forward_op = forward_op_id2forward_op[forward_op_id] + forward_op_dist_attr = dist_context.get_op_dist_attr_for_program( + forward_op) + dist_op = get_distributed_operator_impl_container(forward_op.type) -def _insert_src_op(src_op, dst_block, varname_mapping): - - new_op_desc = dst_block.desc.append_op() - new_op_desc.copy_from(src_op.desc) - for local_varname in src_op.desc.input_arg_names(): - new_op_desc._rename_input(local_varname, varname_mapping[local_varname]) - for local_varname in src_op.desc.output_arg_names(): - 
new_op_desc._rename_output(local_varname, - varname_mapping[local_varname]) - dst_block._sync_with_cpp() - - -def _insert_dist_op(src_op, dst_block, varname_mapping, dist_context, rank_id): - - # build input varname mapping - input_mapping = {} - for input_name in src_op.desc.input_names(): - varnames = [] - for varname in src_op.desc.input(input_name): - varnames.append(varname_mapping[varname]) - input_mapping[input_name] = varnames - - # build output varname mapping - output_mapping = {} - for output_name in src_op.desc.output_names(): - varnames = [] - for varname in src_op.desc.output(output_name): - varnames.append(varname_mapping[varname]) - output_mapping[output_name] = varnames - - # append dist op - dist_attr = dist_context.get_op_dist_attr_for_program(src_op) - dist_ops = get_distributed_operator_impl_container(src_op.type) - append_op_handle = dist_ops.get_impl(dist_attr.impl_idx).forward(src_op) - append_op_handle( - dst_block, - src_op, - dist_attr, - input_mapping, - output_mapping, - rank_id=rank_id) - - -def is_forward_op(op): - role1 = int(core.op_proto_and_checker_maker.OpRole.Forward) | int( - core.op_proto_and_checker_maker.OpRole.Loss) - role2 = int(core.op_proto_and_checker_maker.OpRole.Forward) - op_role = int(op.attr('op_role')) - return op_role == role2 or op_role == role1 + # TODO backward should have its own impl_idx + if dist_op and forward_op_dist_attr.impl_idx >= 0 and dist_op.get_impl( \ + forward_op_dist_attr.impl_idx)._backward_implemented: + return dist_op.get_impl(forward_op_dist_attr.impl_idx) + + # NOTE trick for dist ops that only have backward implement + if backward_op.type in BACKWARD_ONLY_DIST_OPS: + op_dist_attr = dist_context.get_op_dist_attr_for_program(backward_op) + assert op_dist_attr.impl_idx >= 0 + return get_distributed_operator_impl_container( + backward_op.type).get_impl(op_dist_attr.impl_idx) + + dist_op = get_distributed_operator_impl_container("default") + return dist_op.get_impl(0) + + +def _get_dist_op_forward_implement(forward_op, dist_context): + dist_attr = dist_context.get_op_dist_attr_for_program(forward_op) + dist_op = get_distributed_operator_impl_container(forward_op.type) + + if dist_op and dist_attr.impl_idx >= 0 and dist_op.get_impl( + dist_attr.impl_idx)._forward_implemented: + return dist_op.get_impl(dist_attr.impl_idx) + + else: + dist_op = get_distributed_operator_impl_container("default") + return dist_op.get_impl(0) diff --git a/python/paddle/distributed/auto_parallel/planner.py b/python/paddle/distributed/auto_parallel/planner.py old mode 100644 new mode 100755 index 7c4ce0b243506..1dfefb41c80a3 --- a/python/paddle/distributed/auto_parallel/planner.py +++ b/python/paddle/distributed/auto_parallel/planner.py @@ -386,15 +386,20 @@ def search(self): class MCMC(SearchAlgorithm): - def __init__(self, serial_program_info, max_search_times=5): + def __init__(self, serial_program_info, parallelizer, max_search_times=5): super(MCMC, self).__init__("mcmc") self._serial_program_info = serial_program_info self._max_search_times = max_search_times + self._parallelizer = parallelizer @property def serial_program_info(self): return self._serial_program_info + @property + def parallelizer(self): + return self._parallelizer + @property def max_search_times(self): return self._max_search_times @@ -483,7 +488,7 @@ def estimate_searched_strategy_cost(self, cost = None # get all distributed programs all_dist_main_program = get_all_distributed_main_program( - self.serial_program_info, dist_context) + self.serial_program_info, 
dist_context, self.parallelizer) pipeline_config = [ process_mesh.processes for process_mesh in pipeline_process_meshes ] if pipeline_process_meshes is not None else None @@ -829,8 +834,10 @@ def search(self): class Planner: - def __init__(self, serial_program_info, algorithm_config=None): + def __init__(self, serial_program_info, parallelizer, + algorithm_config=None): self._serial_program_info = serial_program_info + self._parallelizer = parallelizer self._algorithm_config = algorithm_config self._algorithm_searcher = self.create_algorithm_searcher( algorithm_config) @@ -847,6 +854,10 @@ def algorithm_config(self): def algorithm_searcher(self): return self._algorithm_searcher + @property + def parallelizer(self): + return self._parallelizer + def create_algorithm_searcher(self, algorithm_config): name = algorithm_config.get("name", None) assert name is not None, "Invalid algorithm config." @@ -856,9 +867,9 @@ def create_algorithm_searcher(self, algorithm_config): # NOTE: Only GPU clusters are supported now. max_search_times = algorithm_config.get("max_search_times", None) algorithm_searcher = MCMC( - self.serial_program_info, + self.serial_program_info, self.parallelizer, max_search_times) if max_search_times is not None else MCMC( - self.serial_program_info) + self.serial_program_info, self.parallelizer) else: raise NotImplementedError( "Other search algorithms have not been supported now.") diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py old mode 100755 new mode 100644 index 3b392d4e088de..2316f207ffe8e --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -25,6 +25,7 @@ from paddle.framework.io import _to_LodTensor from paddle.distributed.fleet.meta_optimizers.common import OpRole from paddle.fluid.io import is_parameter, is_belong_to_optimizer +from paddle.distributed.auto_parallel.dist_attribute import TensorDistributedAttribute, OperatorDistributedAttribute def is_valid_list_index(list, index): @@ -993,26 +994,42 @@ def set_grad_var_shape(program, dist_context): block = program.global_block() vars = block.vars for op in block.ops: - if op.type == "sum": + + if op.type in ["check_finite_and_unscale", "update_loss_scaling"]: + break + + if op.type in ["sum"]: continue if int(op.attr('op_role')) == int(OpRole.Backward): op_dist_attr = dist_context.get_op_dist_attr_for_program(op) assert op_dist_attr is not None for var_name in op.output_arg_names: + assert "@GRAD" in var_name forward_var_name = var_name[:var_name.find("@GRAD")] - if op.type == "c_allreduce_sum" or op.type == "c_identity" or op.type == "scale": + if op.type in [ + "c_allreduce_sum", "c_identity", "scale", "cast" + ]: forward_var_name = op.input_arg_names[0] + elif op.type == "matmul_v2_grad": + forward_var_name = None + for output_name in op.output_names: + if var_name in op.output(output_name): + assert "@GRAD" in output_name + input_name = output_name[:output_name.find("@GRAD")] + assert len(op.input(input_name)) == 1 + forward_var_name = op.input(input_name)[0] + assert forward_var_name is not None need_set_shape_list = [ "reshape2_grad", "softmax_with_cross_entropy_grad", "transpose2_grad", "softmax_grad", "cross_entropy_grad2", - "dropout_grad", "unsqueeze2_grad" + "dropout_grad" ] forward_list = [ "reshape2", "softmax_with_cross_entropy", "transpose2", - "softmax", "cross_entropy2", "dropout", "unsqueeze2" + "softmax", "cross_entropy2", "dropout" ] if op.type in need_set_shape_list: for 
forward_op in block.ops: @@ -1027,6 +1044,7 @@ def set_grad_var_shape(program, dist_context): forward_input_dist_attr = op_dist_attr.get_input_dist_attr( forward_var_name) + assert forward_input_dist_attr is not None, f"{forward_var_name}" forward_var = vars[forward_var_name] forward_var_dist_attr = dist_context.get_tensor_dist_attr_for_program( @@ -1041,6 +1059,70 @@ def set_grad_var_shape(program, dist_context): grad_var.desc.set_shape(ref_shape) +OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() +OpRole = core.op_proto_and_checker_maker.OpRole + + +def is_forward_op(op): + ref_role1 = int(core.op_proto_and_checker_maker.OpRole.Forward) + ref_role2 = int(core.op_proto_and_checker_maker.OpRole.Loss) + op_role = int(op.attr('op_role')) + return OP_ROLE_KEY in op.attr_names and (op_role == ref_role1 or + op_role == ref_role2) + + +def is_backward_op(op): + return OP_ROLE_KEY in op.attr_names and \ + int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Backward) + + +def is_recompute_op(op): + return OP_ROLE_KEY in op.attr_names and \ + int(op.all_attrs()[OP_ROLE_KEY]) == 9 + + +def is_loss_op(op): + return OP_ROLE_KEY in op.attr_names and \ + int(op.all_attrs()[OP_ROLE_KEY]) == (int(core.op_proto_and_checker_maker.OpRole.Forward) | int(core.op_proto_and_checker_maker.OpRole.Loss)) + + +def get_loss_op(block): + loss_ops = [] + for op in block.ops: + if is_loss_op(op): + assert len(op.desc.output_arg_names( + )) == 1, "loss op should only output loss var" + loss_ops.append(op) + + assert len(loss_ops) == 1, "num of loss op is not equal to one" + return loss_ops[0] + + +def set_var_dist_attr(dist_context, var, dims_mapping, process_mesh, **kwargs): + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = dims_mapping + # TODO get global mesh group + tensor_dist_attr.process_mesh = process_mesh + dist_context.set_tensor_dist_attr_for_program(var, tensor_dist_attr) + return tensor_dist_attr + + +def naive_set_dist_op_attr_for_program_by_mesh_and_mapping(new_op, process_mesh, + ref_mapping, ctx): + assert process_mesh is not None + assert ref_mapping is not None + + new_op_dist_attr = OperatorDistributedAttribute() + + for input_varname in new_op.desc.input_arg_names(): + new_op_dist_attr.set_input_dims_mapping(input_varname, ref_mapping) + for output_varname in new_op.desc.output_arg_names(): + new_op_dist_attr.set_output_dims_mapping(output_varname, ref_mapping) + + new_op_dist_attr.process_mesh = process_mesh + ctx.set_op_dist_attr_for_program(new_op, new_op_dist_attr) + + def update_op_dims_mapping_by_default_dist_impl(dist_op): changed = False op_dist_attr = dist_op.dist_attr @@ -1177,57 +1259,25 @@ def update_op_dims_mapping_by_elementwise_like_dist_impl(dist_op): return changed -def get_all_distributed_main_program(serial_program_info, dist_context): +def get_all_distributed_main_program(serial_program_info, dist_context, + parallelizer): "Get all distributed main programs by dist_context." 
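# NOTE: the op-role helpers added above rely on op_role being a bit field.  A
# tiny self-contained illustration of why is_backward_op uses a bitwise AND
# rather than an equality test; the flag values are written as plain ints and
# follow the framework's OpRole enum (Forward=0x0, Backward=0x1, Loss=0x100).
FORWARD, BACKWARD, LOSS = 0x0000, 0x0001, 0x0100

def looks_backward(op_role):
    return bool(op_role & BACKWARD)

print(looks_backward(BACKWARD))          # True  -> an ordinary grad op
print(looks_backward(BACKWARD | LOSS))   # True  -> the loss grad op (role 257)
print(looks_backward(FORWARD | LOSS))    # False -> the loss op itself (role 256)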
- from .dist_context import DistributedOperatorContext + from .dist_context import DistributedOperatorContext, DistributedContext cluster = serial_program_info.cluster + copied_parallelizer = copy.deepcopy(parallelizer) all_dist_main_program = [] ranks = paddle.distributed.get_world_size() if cluster is None else len( cluster.get_all_devices("GPU")) for rank_id in range(ranks): used_dist_context = copy.deepcopy(dist_context) used_dist_context._dist_op_context = DistributedOperatorContext() - dist_main_program, dist_startup_program = get_specified_distributed_main_program( - serial_program_info, used_dist_context, rank_id) + _, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program( + rank_id, used_dist_context) all_dist_main_program.append(dist_main_program) return all_dist_main_program -def get_specified_distributed_main_program(serial_program_info, dist_context, - rank_id): - "Get distributed main program by the given dist_context and rank_id." - from .partitioner import Partitioner - from .reshard import reshard, HAS_SENT, HAS_RECV, HAS_ALLGATHER - from .process_group import _g_process_group_map, ProcessGroup - - dist_strategy = paddle.distributed.fleet.DistributedStrategy() - train_program = serial_program_info.train_program - startup_program = serial_program_info.startup_program - loss = serial_program_info.loss - optimizer = serial_program_info.optimizer - - partitioner = Partitioner(dist_strategy, dist_context, rank_id) - dist_main_program, dist_startup_program = partitioner.transpile_forward( - train_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, train_program, startup_program, dist_main_program, - dist_startup_program) - opt_ops = partitioner.apply_optimize( - copy.deepcopy(optimizer), dist_params_grads, dist_main_program, - dist_startup_program) - set_grad_var_shape(dist_main_program, dist_context) - make_data_unshard(dist_main_program, dist_startup_program, dist_context) - reshard(dist_main_program, dist_startup_program, rank_id, dist_context) - HAS_SENT.clear() - HAS_RECV.clear() - HAS_ALLGATHER.clear() - - _g_process_group_map.clear() - _g_process_group_map[0] = ProcessGroup(0, []) - return dist_main_program, dist_startup_program - - class SerialProgramInfo: def __init__(self, train_program, @@ -1286,7 +1336,6 @@ def _compute_runtime(op_cost, op, vars): shape = list(map(lambda x: int(x.strip()), shape)) dtype_factor = 1 total_static_input_size += reduce(lambda x, y: x * y, shape) - # print(arg_name_lower) if op.type == "c_embedding": arg_name_lower = "w" if arg_name_lower == "weight" else "ids" for arg_name in op.input_names: @@ -1301,7 +1350,8 @@ def _compute_runtime(op_cost, op, vars): actual_runtime = total_actual_input_size / total_static_input_size * runtime return actual_runtime - cost_model = paddle.cost_model.CostModel() + import paddle.cost_model as cm + cost_model = cm.CostModel() cost_model.static_cost_data() DEFAULT_MULTIPLE = 2 OP_NAME_MAPPING = { @@ -1352,3 +1402,19 @@ def _compute_runtime(op_cost, op, vars): standalone_cost_data.append(cost_data) return standalone_cost_data + + +def set_dist_op_desc_original_id(dist_op_desc, op_desc, dist_context): + op_id = op_desc.id() + op_original_id = op_desc.original_id() + # First, try to set the original id to the id of the op_desc + if op_id in dist_context._dist_ops_for_program: + dist_op_desc.set_original_id(op_id) + return + # Second, try to set the original id to the original_id of the op_desc + elif op_original_id in 
dist_context._dist_ops_for_program: + dist_op_desc.set_original_id(op_original_id) + return + # Third, print error infomation if we cannot find the original id + else: + assert False, "Cannot find the original id in the distributed context" diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 14a411ae25356..3731332d1e777 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -1219,6 +1219,7 @@ def _parallel_linear(x, inputs={'X': linear_out}, outputs={'Out': out}, attrs={ + 'rank': inner_rank, 'ring_id': ring_id, 'nranks': nranks, 'use_calc_stream': True, diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index c19ee1e192761..bc59b87e2ffa5 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -627,6 +627,8 @@ def init_server(self, *args, **kwargs): """ self._runtime_handle._init_server(*args, **kwargs) + @is_non_distributed_check + @inited_runtime_handler def load_model(self, path, mode): """ load fleet model from path @@ -699,6 +701,8 @@ def stop_worker(self): """ self._runtime_handle._stop_worker() + @is_non_distributed_check + @inited_runtime_handler def save(self, dirname, feed=[], fetch=[], **configs): inference = True @@ -742,6 +746,8 @@ def save(self, dirname, feed=[], fetch=[], **configs): self._runtime_handle._save_persistables( executor, dirname, main_program=None, mode=increment_mode) + @is_non_distributed_check + @inited_runtime_handler def save_inference_model(self, executor, dirname, @@ -777,6 +783,8 @@ def save_inference_model(self, executor, dirname, feeded_var_names, target_vars, main_program, export_for_deployment, mode) + @is_non_distributed_check + @inited_runtime_handler def save_persistables(self, executor, dirname, main_program=None, mode=0): """ diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 8b75c57fab407..52468ab533496 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -53,6 +53,7 @@ def __init__(self, optimizer): "AMPOptimizer", "LarsOptimizer", "LambOptimizer", + "ASPOptimizer", # "ModelParallelOptimizer", # "PipelineOptimizer", ] diff --git a/python/paddle/distributed/passes/__init__.py b/python/paddle/distributed/passes/__init__.py index 55c90abf142e7..87454d8842497 100644 --- a/python/paddle/distributed/passes/__init__.py +++ b/python/paddle/distributed/passes/__init__.py @@ -14,6 +14,8 @@ from .pass_base import new_pass, PassManager, PassContext from .fuse_all_reduce import * +from .auto_parallel_gradient_merge import * +from .auto_parallel_sharding import * from .cpp_pass import * __all__ = [ diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py new file mode 100644 index 0000000000000..310358436ae32 --- /dev/null +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -0,0 +1,314 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from collections import OrderedDict +from typing import List, Tuple, Dict, Any + +import paddle +from paddle.framework import core +from paddle.fluid.framework import program_guard, device_guard +from paddle.fluid import unique_name, layers +from paddle.fluid.clip import append_gradient_clip_ops +from .pass_base import PassBase, PassType, register_pass + + +def _is_the_backward_op(op): + OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() + OpRole = core.op_proto_and_checker_maker.OpRole + return OP_ROLE_KEY in op.attr_names and \ + int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Backward) + + +def _is_the_optimizer_op(op): + OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() + OpRole = core.op_proto_and_checker_maker.OpRole + return OP_ROLE_KEY in op.attr_names and \ + int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize) + + +def _remove_and_get_optimizer_op(main_program, dist_context): + # 1 create tmp block + # 2 mv optimizer op from global program to tmp block + # 3 del the op from dist_context + from paddle.distributed.fleet.meta_optimizers.common import OpRole + main_block = main_program.global_block() + temp_block = main_program._create_block() + removed_op_idx = [] + optimize_ops_desc = [] + for idx, op in enumerate(main_block.ops): + if _is_the_optimizer_op(op): + # append optimizer op to tmp block + new_op_desc = temp_block.desc.append_op() + new_op_desc.copy_from(op.desc) + optimize_ops_desc.append(new_op_desc) + removed_op_idx.append(idx) + + # del op from dist_context + if dist_context: + dist_context.del_dist_op_for_program(op) + + for idx in removed_op_idx[::-1]: + main_block._remove_op(idx) + + return optimize_ops_desc + + +def _remove_op_role_var(param, grad): + op_maker = core.op_proto_and_checker_maker + op = grad.op + assert _is_the_backward_op(op), \ + 'grad.op={} is not the backward op which produces the grad={}' \ + .format(op, grad.name) + + if op.has_attr(op_maker.kOpRoleVarAttrName()): + op._remove_attr(op_maker.kOpRoleVarAttrName()) + + +def _get_gm_cond_var(main_program, k_steps): + main_block = main_program.global_block() + # Add const var + k_step_var = layers.create_global_var( + name="gradient_merge_k", + shape=[1], + value=int(k_steps), + dtype='int32', + persistable=True, + force_cpu=True) + + zero_var = layers.create_global_var( + name="gradient_merge_zero", + shape=[1], + value=int(0), + dtype='int32', + persistable=True, + force_cpu=True) + + # Add step var & cond var + step_var = layers.create_global_var( + name="gradient_merge_step", + shape=[1], + value=int(0), + dtype='int32', + persistable=True, + force_cpu=True) + + cond_var = layers.create_global_var( + name="gradient_merge_cond", + shape=[1], + value=bool(0), + dtype='bool', + persistable=False, + force_cpu=True) + + with device_guard("cpu"): + # step_var = (step_var + 1) % k_step + layers.increment(x=step_var, value=1.0, in_place=True) + main_block.append_op( + type='elementwise_mod', + inputs={'X': step_var, + 'Y': k_step_var}, + outputs={'Out': step_var}, + attrs={'axis': -1, + 'use_mkldnn': False}) + + # cond_var = (step_var == 0) + 
main_block.append_op( + type='equal', + inputs={'X': step_var, + 'Y': zero_var}, + outputs={'Out': cond_var}) + + return cond_var + + +def _append_gradient_merge_backward_op( + main_program, + startup_program, + params_grads: List[Tuple[Any, Any]], + cond_var_name: str) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]: + main_block = main_program.global_block() + startup_block = startup_program.global_block() + + # step1: remove grad.op's op_role_var + for param, grad in params_grads: + assert ( + param.type != core.VarDesc.VarType.SELECTED_ROWS + ), "SELECTED_ROWS is not supported in GradientMergeOptimizer for now" + + _remove_op_role_var(param, grad) + + param_to_gradient_merge = {} + new_params_to_grads = [] + # step2: create gradient_merge var and init with 0 + for param, grad in params_grads: + param_name = param.name + param_var = main_block.var(param_name) + assert (param_var is not None) + gradient_merge_var = main_block.create_var( + name=param_name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True) + param_to_gradient_merge[param_name] = gradient_merge_var + + startup_gradient_merge_var = startup_block.create_var( + name=param_name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True) + startup_block.append_op( + type="fill_constant", + outputs={"Out": startup_gradient_merge_var}, + attrs={ + "shape": param_var.shape, + "dtype": param_var.dtype, + "value": float(0), + }) + + # grad_merge += grad + new_grad_op = main_block.append_op( + type="elementwise_add", + inputs={'X': grad, + 'Y': gradient_merge_var}, + outputs={'Out': gradient_merge_var}, + attrs={'axis': -1, + 'use_mkldnn': False}) + new_params_to_grads.append([param, gradient_merge_var]) + return new_params_to_grads, param_to_gradient_merge + + +def _create_cond_block_and_update_optimizer( + main_program, + cond_var, + new_params_to_grads: List[Tuple[Any, Any]], + param_to_gradient_merge: Dict[str, Any], + optimize_ops_desc: List[Any], + k_steps, + avg): + def true_apply_gradient(): + cur_block_idx = main_program.current_block_idx + cur_block = main_program.current_block() + + # cur_block's forward_block & backward_block is itself + cur_block._set_forward_block_idx(cur_block_idx) + op_maker = core.op_proto_and_checker_maker + if avg: + for param, new_grad in new_params_to_grads: + # grad /= k_steps + cur_block.append_op( + type='scale', + inputs={'X': new_grad}, + outputs={'Out': new_grad}, + attrs={ + 'scale': 1.0 / k_steps, + 'bias': 0.0, + 'bias_after_scale': False + }) + new_grad.op._set_attr(op_maker.kOpRoleAttrName(), + op_maker.OpRole.Optimize) + + # append optimizer ops + for op_desc in optimize_ops_desc: + new_op_desc = cur_block.desc.append_op() + new_op_desc.copy_from(op_desc) + + #update input/output + for input_name in new_op_desc.input_arg_names(): + if input_name in new_params_to_grads: + new_op_desc._rename_input(input_name, + new_params_to_grads[input_name]) + + for output_name in new_op_desc.output_arg_names(): + if output_name in new_params_to_grads: + new_op_desc._rename_output(output_name, + new_params_to_grads[output_name]) + + # remove op_role_var + if new_op_desc.has_attr(op_maker.kOpRoleVarAttrName()): + new_op_desc.remove_attr(op_maker.kOpRoleVarAttrName()) + + # op's update Grad + if new_op_desc.input("Grad"): + grad_value = new_op_desc.input("Grad")[0] + # TODO FIXME(xym) support fp16 + grad_merge_value = grad_value + '@GradientMerge' + new_op_desc.set_input("Grad", [grad_merge_value]) + + 
main_program.global_block()._sync_with_cpp() + cur_block._sync_with_cpp() + + # clear gradient_merge_vars + for param, new_grad in new_params_to_grads: + layers.fill_constant( + shape=new_grad.shape, + dtype=new_grad.dtype, + value=0.0, + out=new_grad) + new_grad.op._set_attr(op_maker.kOpRoleAttrName(), + op_maker.OpRole.Optimize) + + layers.cond(cond_var, true_fn=true_apply_gradient, false_fn=None) + + +def parse_program(main_program, startup_program, params_grads, k_steps, avg, + dist_context): + # 1 create gradient_merge_cond + cond_var = _get_gm_cond_var(main_program, k_steps) + + # 2 remove optimizer_op from main_program + optimize_ops_desc = _remove_and_get_optimizer_op(main_program, dist_context) + + # back to block 0 + main_program._rollback() + + # 3 append gradient merge backward op to main_program + new_params_to_grads, param_to_gradient_merge = _append_gradient_merge_backward_op( + main_program, startup_program, params_grads, cond_var.name) + + # 4 create ConditionalBlock and append gradient merge optimizer ops + _create_cond_block_and_update_optimizer( + main_program, cond_var, new_params_to_grads, param_to_gradient_merge, + optimize_ops_desc, k_steps, avg) + + +@register_pass("auto_parallel_gradient_merge_pass") +class GradientMergePass(PassBase): + def __init__(self): + super(GradientMergePass, self).__init__() + self.set_attr("k_steps", -1) + self.set_attr("avg", True) + self.set_attr("inner_optimizer", None) + + def _check_self(self): + if self.get_attr("k_steps") < 1: + return False + return True + + def _check_conflict(self, other_pass): + return True + + def _type(self): + return PassType.COMM_OPT + + def _apply_single_impl(self, main_program, startup_program, context): + k_steps = self.get_attr("k_steps", -1) + avg = self.get_attr("avg", False) + dist_context = self.get_attr("dist_context") + params_grads = self.get_attr("params_grads") + with paddle.static.program_guard(main_program, startup_program): + parse_program(main_program, startup_program, params_grads, k_steps, + avg, dist_context) + + main_program._sync_with_cpp() diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py new file mode 100644 index 0000000000000..2785eae6e8a46 --- /dev/null +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -0,0 +1,700 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
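For orientation, a pass registered like the gradient-merge pass above is normally driven through the framework exported from python/paddle/distributed/passes (new_pass, PassContext, whose exports are extended earlier in this diff). A minimal sketch, assuming main_prog, startup_prog, params_grads and dist_ctx already exist from the auto-parallel partitioning step; the attribute names simply mirror the get_attr calls in _apply_single_impl above:

    from paddle.distributed.passes import new_pass, PassContext

    gm_pass = new_pass("auto_parallel_gradient_merge_pass", {
        "k_steps": 4,              # accumulate gradients over 4 micro-batches
        "avg": True,               # scale the merged gradient by 1/k_steps
        "dist_context": dist_ctx,
        "params_grads": params_grads,
    })
    gm_pass.apply([main_prog], [startup_prog], PassContext())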
+ +from functools import reduce +from collections import OrderedDict +import numpy as np + +import paddle +from paddle.framework import core +from paddle.fluid import unique_name +from .pass_base import PassBase, register_pass +from paddle.distributed.fleet.meta_optimizers.common import is_backward_op, is_optimizer_op +from paddle.distributed.auto_parallel.process_group import get_world_process_groups, new_process_group +from paddle.distributed.auto_parallel.operators.common import is_parameter_related +from paddle.distributed.auto_parallel.utils import _get_comm_group, naive_set_dist_op_attr_for_program_by_mesh_and_mapping, set_var_dist_attr + +OpRole = core.op_proto_and_checker_maker.OpRole +OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() +_skip_ops = ['create_py_reader', 'create_double_buffer_reader', 'read', 'slice'] +# update here to support new optimizers +_supported_optimizer_type = [ + "adam", "adamax", "adamw", "decayed_adagrad", "momentum", "dgc_momentum", + "lars_momentum", "merged_momentum", "lamb", "sgd" +] + + +# NOTE we add the "auto_parallel" prefix to the pass in order to +# indicate that this pass should obey some constrains by auto_parallel +# for example all ops and vars should has dist attr before and after pass +# should use dist op instead of custom comm op +@register_pass("auto_parallel_sharding") +class ShardingPass(PassBase): + def __init__(self): + super(ShardingPass, self).__init__() + self.set_attr("dist_context", None) + self.set_attr("stage", None) + self.set_attr("sharding_degree", None) + self.set_attr("params_grads", []) + self.set_attr("global_rank", -1) + self.dp_groups = set() + self.sharding_infos = [] + self.varname_to_sharding_info = {} + self.partial_sharding = False + self.outer_dp_group = None + + def _check_self(self): + if self.get_attr("dist_context") is None: + return False + + if self.get_attr("stage") not in [1, 2, 3]: + return False + if (not isinstance(self.get_attr("sharding_degree"), + int)) or self.get_attr("sharding_degree") <= 1: + return False + if len(self.get_attr("params_grads")) <= 0: + return False + if (not isinstance(self.get_attr("global_rank"), + int)) or self.get_attr("global_rank") < 0: + return False + + return True + + def _check_conflict(self, other_pass): + return True + + def _apply_single_impl(self, main_program, startup_program, context): + self._dist_context = self.get_attr("dist_context") + self.sharding_world_size = int(self.get_attr("sharding_degree")) + self.stage = int(self.get_attr("stage")) + self.global_rank = int(self.get_attr("global_rank")) + params_grads = self.get_attr("params_grads") + main_block, startup_block = main_program.global_block( + ), startup_program.global_block() + + self._build_sharding_groups(main_block, params_grads) + self._shard_optimizer(main_block, startup_block, params_grads, context) + self._shard_gradient_synchronization(main_block) + self._shard_parameter(main_block, startup_block) + + def _build_sharding_groups(self, main_block, params_grads): + self._collective_data_parallel_groups(main_block) + self._build_sharding_infos(params_grads) + + def _collective_data_parallel_groups(self, main_block): + for op in main_block.ops: + if not _is_forward_op(op) or op.type in _skip_ops: + continue + group = _inference_data_parallel_group_for_operator( + self.global_rank, op, self._dist_context) + if group is not None: + self.dp_groups.add(group) + + # TODO(JZ-LIANG) allow more than one dp groups in network, support more general distribution + # genetated by auto search + if 
len(self.dp_groups) != 1: + raise NotImplementedError( + "So far Only and Exactly one data parallel group in network are supported, but got [{}] different data parallel groups". + format(len(self.dp_groups))) + + def _build_sharding_infos(self, params_grads): + + for dp_group in self.dp_groups: + + assert dp_group.nranks >= self.sharding_world_size, "sharding world size [{}] should not larger than dp world size [{}]".format( + self.sharding_world_size, dp_group.nranks) + assert dp_group.nranks % self.sharding_world_size == 0, "sharding world size [{}] should be divisible by dp world size [{}]".format( + self.sharding_world_size, dp_group.nranks) + assert self.global_rank in dp_group.ranks, "current ranks [{}] does NOT belong to the data parallel group [{}]".format( + self.global_rank, dp_group.ranks) + assert len( + params_grads + ) >= self.sharding_world_size, "number of parameters [{}] is not enough to be shard among [{}] ranks".format( + len(params_grads), self.sharding_world_size) + + # sharding hybrid data parallel: partial sharding param within + if dp_group.nranks > self.sharding_world_size: + self.partial_sharding = True + assert len( + self.dp_groups + ) == 1, "hybrid sharding and data parallelism are supported only when there is excatly one data parallel group in the network" + outer_dp_group, sharding_group = _get_dp_and_sharding_groups( + dp_group.ranks, self.sharding_world_size, self.global_rank) + sharding_group = new_process_group(sharding_group) + self.outer_dp_group = new_process_group(outer_dp_group) + else: + sharding_group = dp_group + + # TODO(JZ-LIANG) when support multiple dp groups in future, should group param and bind them to corresponding dp group + params_in_group = [p for p, g in params_grads] + assert len(params_in_group) == len(set( + params_in_group)), "found duplicated param in params_grads" + sharding_info = ShardingInfo(sharding_group, self.global_rank, + params_in_group) + self.sharding_infos.append(sharding_info) + for param in params_in_group: + self.varname_to_sharding_info[param.name] = sharding_info + + def _shard_optimizer(self, main_block, startup_block, params_grads, + pass_context): + """ + sharding all optimizer related ops and vars, include: + gradient clip ops & vars + weight decay ops & vars + optimizer ops and states + """ + self._shard_amp_related_op_and_vars(main_block, pass_context) + self._shard_weight_decay(main_block) + self._shard_gradient_clip(main_block) + self._shard_optimizer_ops_and_states(main_block, startup_block) + self._insert_optimizer_broadcasts(main_block, startup_block) + + def _shard_amp_related_op_and_vars(self, main_block, pass_context): + + if self.stage < 2: + return + + for idx, op in reversed(list(enumerate(main_block.ops))): + # shard amp related param_grad cast + if _is_param_grad_fp32_cast_op(main_block, op): + output_name = op.output_arg_names[0] + param_name = output_name[:output_name.find("@")] + if not self._is_parameter_in_local_shard(param_name): + main_block._remove_op(idx, sync=False) + main_block._remove_var(output_name, sync=False) + + # shard check nan inf + elif op.type in ["check_finite_and_unscale", "update_loss_scaling"]: + reversed_x = [] + for input_name in op.desc.input('X'): + param_name = input_name[:input_name.find("@")] + + if self._is_parameter_in_local_shard(param_name): + reversed_x.append(input_name) + op.desc.set_input('X', reversed_x) + op.desc.set_output('Out', reversed_x) + + main_block._sync_with_cpp() + + def _shard_gradient_clip(self, main_block): + + if self.stage < 2: + return 
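+        # Gradient clipping under sharding: each rank keeps the squared_l2_norm /
+        # clip_by_norm ops only for gradients in its local shard, drops the others
+        # from the global-norm 'sum', then all-reduces the partial sum across the
+        # sharding group so every rank clips with the complete global norm.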
+ + # TODO (JZ-LIANG) support calculate global norm with tensor parallelism + removed_op_type = ['elementwise_mul', 'squared_l2_norm', 'clip_by_norm'] + removed_op_idx = set() + removed_tmp_var = set() + + for idx, op in list(enumerate(main_block.ops)): + if not _is_gradient_clip_op(op): + continue + + if op.type in removed_op_type: + input_name = op.input("X")[0] + param_name = input_name[:input_name.find("@GRAD")] + if not self._is_parameter_in_local_shard(param_name): + removed_op_idx.add(idx) + if op.type in ['squared_l2_norm', 'clip_by_norm']: + for output_name in op.output_arg_names: + removed_tmp_var.add(output_name) + + for idx, op in reversed(list(enumerate(main_block.ops))): + if not _is_gradient_clip_op(op): + continue + if idx in removed_op_idx: + main_block._remove_op(idx, sync=False) + + for varname in removed_tmp_var: + main_block._remove_var(varname, sync=False) + + for idx, op in list(enumerate(main_block.ops)): + if not _is_gradient_clip_op(op): + continue + if op.type == 'sum': + reserved_vars = [] + for input_name in op.input_arg_names: + if input_name not in removed_tmp_var: + reserved_vars.append(input_name) + op.desc.set_input("X", reserved_vars) + + sum_op_output = op.desc.output_arg_names()[0] + for i, sharding_info in enumerate(self.sharding_infos): + new_op = main_block._insert_op( + idx + i + 1, + type='c_allreduce_sum', + inputs={'X': [sum_op_output]}, + outputs={'Out': [sum_op_output]}, + attrs={ + 'ring_id': sharding_info.group.id, + 'op_namescope': "/gradient_clip_model_parallelism", + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize, + }) + dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + main_block.var(sum_op_output)) + assert dist_attr is not None + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + new_op, dist_attr.process_mesh, dist_attr.dims_mapping, + self._dist_context) + break + + main_block._sync_with_cpp() + + def _shard_weight_decay(self, main_block): + + if self.stage < 2: + return + + for idx, op in reversed(list(enumerate(main_block.ops))): + if not _is_weight_decay_op(op): + continue + else: + raise NotImplementedError( + "weight decay is NOT supported by now") + main_block._sync_with_cpp() + + def _shard_optimizer_ops_and_states(self, main_block, startup_block): + + should_removed_optimizer_states = [] + for idx, op in reversed(list(enumerate(main_block.ops))): + if not is_optimizer_op(op): + break + + if op.type in _supported_optimizer_type: + assert "Param" in op.input_names + assert len(op.input("Param")) == 1 + param_name = op.input("Param")[0] + if not self._is_parameter_in_local_shard(param_name): + should_removed_optimizer_states.extend([ + varname for varname in op.output_arg_names + if varname != param_name + ]) + main_block._remove_op(idx, sync=False) + + for idx, op in reversed(list(enumerate(startup_block.ops))): + if len(op.output_arg_names) == 1 and op.output_arg_names[ + 0] in should_removed_optimizer_states: + startup_block._remove_op(idx, sync=False) + + for varname in should_removed_optimizer_states: + if main_block.has_var(varname): + main_block._remove_var(varname, sync=False) + if startup_block.has_var(varname): + startup_block._remove_var(varname, sync=False) + + main_block._sync_with_cpp() + startup_block._sync_with_cpp() + + def _insert_optimizer_broadcasts(self, main_block, startup_block): + + if self.stage > 2: + return + + for sharding_info in self.sharding_infos: + for param in sharding_info.params: + assert main_block.has_var(param.name) + assert startup_block.has_var(param.name) 
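+                # Stage-1/2 sharding keeps a full parameter replica on every rank,
+                # so after the sharded optimizer step the owning rank
+                # (root = get_var_rank) broadcasts the updated parameter to the
+                # rest of its sharding group.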
+ + new_op = main_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': sharding_info.group.id, + 'root': sharding_info.get_var_rank(param.name), + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize + }) + param_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + param) + assert param_dist_attr is not None + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + new_op, param_dist_attr.process_mesh, + param_dist_attr.dims_mapping, self._dist_context) + main_block._sync_with_cpp() + + def _is_parameter_in_local_shard(self, param_name): + assert param_name in self.varname_to_sharding_info + sharding_info = self.varname_to_sharding_info[param_name] + return sharding_info.is_in_local_shard(param_name) + + def _shard_gradient_synchronization(self, main_block): + + if self.stage < 2: + return + + dp_ring_ids = [group.id for group in self.dp_groups] + for idx, op in reversed(list(enumerate(main_block.ops))): + if _is_param_grad_allreduce_op(op, main_block, dp_ring_ids): + input_name = op.input_arg_names[0] + base_name = _get_base_name_from_grad_name(input_name) + sharding_info = self.varname_to_sharding_info[base_name] + _insert_reduce_op( + main_block, idx, input_name, sharding_info.group.id, + sharding_info.get_var_rank(base_name), self._dist_context) + if not self.partial_sharding: + main_block._remove_op(idx + 1, sync=False) + else: + op._set_attr("ring_id", self.outer_dp_group.id) + + main_block._sync_with_cpp() + + def _shard_parameter(self, main_block, startup_block): + + if self.stage < 3: + return + + dp_ring_ids = [group.id for group in self.dp_groups] + for sharding_info in self.sharding_infos: + need_broadcast_vars, param_usage = sharding_info.get_broadcast_vars_and_param_usage( + main_block) + not_used_param_nane = [] + for param_name in param_usage: + if param_usage[param_name] == 0 and sharding_info.get_var_rank( + param_name) != sharding_info.local_rank: + not_used_param_nane.append(param_name) + + for idx, op in reversed(list(enumerate(main_block.ops))): + if is_optimizer_op(op): + continue + + for input_name in op.desc.input_arg_names(): + if op.type == "cast": + continue + if input_name not in need_broadcast_vars: + continue + root_rank = sharding_info.get_var_rank(input_name) + if root_rank == sharding_info.local_rank: + broadcast_varname = input_name + else: + broadcast_varname = unique_name.generate(input_name + + "@BroadCast") + input_var = main_block.var(input_name) + new_var = main_block.create_var( + name=broadcast_varname, + shape=input_var.shape, + dtype=input_var.dtype, + persistable=False) + ref_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + input_var) + out_var_dist_attr = set_var_dist_attr( + self._dist_context, new_var, + ref_dist_attr.dims_mapping, + ref_dist_attr.process_mesh) + op._rename_input(input_name, broadcast_varname) + + _insert_init_and_broadcast_op( + main_block, idx, broadcast_varname, + sharding_info.local_rank, root_rank, + sharding_info.group.id, + op.attr('op_role'), self._dist_context) + + for idx, op in reversed(list(enumerate(main_block.ops))): + if op.type != "cast": + continue + input_name = op.input_arg_names[0] + output_name = op.output_arg_names[0] + if input_name in not_used_param_nane: + main_block._remove_op(idx, sync=False) + main_block._remove_var(output_name, sync=False) + + for idx, op in reversed(list(enumerate(startup_block.ops))): + assert len(op.output_arg_names) == 1 + output_name = op.output_arg_names[0] + + if op.type == 
"c_broadcast" and op.attr( + "ring_id") in dp_ring_ids: + if self.outer_dp_group and sharding_info.get_var_rank( + output_name) == sharding_info.local_rank: + op._set_attr("ring_id", self.outer_dp_group.id) + else: + startup_block._remove_op(idx, sync=False) + continue + + if op.type != "c_broadcast" and output_name in param_usage and sharding_info.get_var_rank( + output_name) != sharding_info.local_rank: + startup_block._remove_op(idx, sync=False) + + for param_name in param_usage: + if sharding_info.get_var_rank( + param_name) != sharding_info.local_rank: + main_block._remove_var(param_name, sync=False) + startup_block._remove_var(param_name, sync=False) + + main_block._sync_with_cpp() + startup_block._sync_with_cpp() + + +def _insert_init_and_broadcast_op(block, insert_idx, varname, local_rank, + root_rank, ring_id, op_role, dist_context): + """ + empty op for initialization + """ + broadcast_var = block.var(varname) + broadcast_var_dist_attr = dist_context.get_tensor_dist_attr_for_program( + broadcast_var) + + new_op = block._insert_op_without_sync( + insert_idx, + type='c_broadcast', + inputs={'X': varname}, + outputs={'Out': varname}, + attrs={ + 'ring_id': ring_id, + 'root': root_rank, + 'use_calc_stream': True, + OP_ROLE_KEY: op_role + }) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + new_op, broadcast_var_dist_attr.process_mesh, + broadcast_var_dist_attr.dims_mapping, dist_context) + if local_rank != root_rank: + + new_op = block._insert_op_without_sync( + insert_idx, + type="empty", + outputs={"Out": broadcast_var.name}, + attrs={ + "shape": broadcast_var.shape, + "dtype": broadcast_var.dtype, + OP_ROLE_KEY: op_role + }) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + new_op, broadcast_var_dist_attr.process_mesh, + broadcast_var_dist_attr.dims_mapping, dist_context) + return + + +def _insert_reduce_op(block, + insert_idx, + reduce_var, + ring_id, + root_id, + dist_context, + op_role=OpRole.Backward, + use_calc_stream=True): + assert root_id >= 0, "root id should be a positive int, but now root id is {}".format( + root_id) + new_op = block._insert_op_without_sync( + insert_idx, + type='c_reduce_sum', + inputs={'X': [reduce_var]}, + outputs={'Out': [reduce_var]}, + attrs={ + 'ring_id': ring_id, + 'root_id': root_id, + 'use_calc_stream': use_calc_stream, + OP_ROLE_KEY: op_role + }) + + dist_attr = dist_context.get_tensor_dist_attr_for_program( + block.var(reduce_var)) + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + new_op, dist_attr.process_mesh, dist_attr.dims_mapping, dist_context) + + +def _get_dp_and_sharding_groups(origin_group, sharding_group_size, rank): + dp_axis = 0 + sharding_axis = 1 + shape = [len(origin_group) // sharding_group_size, sharding_group_size] + + dp_group = _get_comm_group(origin_group, shape, dp_axis, rank) + sharding_group = _get_comm_group(origin_group, shape, sharding_axis, rank) + + return dp_group, sharding_group + + +def _is_gradient_clip_op(op): + return op.desc.has_attr("op_namescope") \ + and op.desc.attr("op_namescope").startswith("/gradient_clip") + + +def _is_weight_decay_op(op): + return op.desc.has_attr("op_namescope") \ + and op.desc.attr("op_namescope").startswith("/regularization") + + +def _is_param_grad_fp32_cast_op(block, op): + if not is_backward_op(op): + return False + if not _is_desired_cast_op(block, op, core.VarDesc.VarType.FP16, + core.VarDesc.VarType.FP32): + return False + output_name = op.desc.output_arg_names()[0] + base_name = output_name[:output_name.find("@")] + if not 
block.has_var(base_name): + return False + return block.var(base_name).is_parameter + + +def _is_param_fp16_cast_op(block, op, params): + + if is_optimizer_op(op): + return False + if not _is_desired_cast_op(block, op): + return False + input_name = op.desc.input_arg_names()[0] + if input_name not in params: + return False + return True + + +def _is_desired_cast_op(block, + op, + src_var_type=core.VarDesc.VarType.FP32, + dst_var_type=core.VarDesc.VarType.FP16): + if op.type != "cast": + return False + assert (len(op.desc.input_arg_names()) == 1) + assert (len(op.desc.output_arg_names()) == 1) + input_var = block.var(op.desc.input_arg_names()[0]) + output_var = block.var(op.desc.output_arg_names()[0]) + + if input_var.dtype != src_var_type or \ + output_var.dtype != dst_var_type: + return False + + return True + + +def _get_base_name_from_grad_name(grad_name): + base_name = None + if ".cast_fp16@GRAD" in grad_name: + base_name = grad_name[:grad_name.find(".cast_fp16@GRAD")] + elif "@GRAD" in grad_name: + base_name = grad_name[:grad_name.find("@GRAD")] + return base_name + + +def _is_param_grad_allreduce_op(op, block, dp_ring_ids): + + if not is_backward_op(op): + return False + if op.type != "c_allreduce_sum": + return False + if op.attr('ring_id') not in dp_ring_ids: + return False + + output_name = op.output_arg_names[0] + base_name = _get_base_name_from_grad_name(output_name) + + if not block.has_var(base_name): + return False + + return block.var(base_name).is_parameter + + +def _is_forward_op(op): + return op.attr("op_role") == 0 + + +def _inference_data_parallel_group_for_operator(rank_id, op, dist_context): + + dp_group = None + for input_name in op.input_arg_names: + if not is_parameter_related(input_name, op.block): + dist_attr = dist_context.get_op_dist_attr_for_program(op) + process_mesh = dist_attr.process_mesh + input_dim_mapping = dist_attr.get_input_dims_mapping(input_name) + mesh_shape = process_mesh.topology + # TODO(JZ-LIANG) replace with specific batch size dimension + batch_size_axis = input_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: + group_ranks = _get_comm_group(process_mesh.processes, + process_mesh.topology, + batch_size_axis, rank_id) + dp_group = new_process_group(group_ranks) + break + + return dp_group + + +def shard_parameters(params, group_size): + # TODO(JZ-LIANG) support multiple partition methods + # method1: greedy even but unorder + # method2: roughly even with oreder + mapping = {} + for rank_ in range(group_size): + mapping[rank_] = [] + sizes = [0] * group_size + for param in params: + rank = sizes.index(min(sizes)) + mapping[rank].append(param) + numel = reduce(lambda x, y: x * y, param.shape) + assert numel > 0, "param [{}] should larger than 0, but it is [{}]".format( + param.name, numel) + sizes[rank] += numel + + return mapping + + +class ShardingInfo(object): + def __init__(self, group, rank, params): + self.group = group + self.params = params + self.param_names = [p.name for p in self.params] + self.group_size = group.nranks + self.global_rank = rank + self.local_rank = group.ranks.index(self.global_rank) + # rank in below mapping are local rank in this sharding group + self.rank_to_params = shard_parameters(self.params, self.group_size) + # include fp32 and fp16 param + self.param_to_rank = dict() + self._map_param_to_rank() + + def _map_param_to_rank(self): + """ + mapping parameters to the rank which holds it. 
+ """ + for rank, params in self.rank_to_params.items(): + for param in params: + self.param_to_rank[param.name] = rank + + def get_var_rank(self, varname): + if varname in self.param_to_rank: + return self.param_to_rank[varname] + return -1 + + def is_in_local_shard(self, param_name): + return self.get_var_rank(param_name) == self.local_rank + + def get_broadcast_vars_and_param_usage(self, block): + broadcast_vars = set([]) + fp16_params = set([]) + fp16_to_fp32 = {} + + param_usage = {x: 0 for x in self.param_names} + for op in block.ops: + if is_optimizer_op(op): + continue + for input_name in op.desc.input_arg_names(): + if input_name in self.param_names: + param_usage[input_name] += 1 + + for op in block.ops: + if not _is_param_fp16_cast_op(block, op, self.param_names): + continue + input_name = op.input_arg_names[0] + output_name = op.output_arg_names[0] + broadcast_vars.add(output_name) + fp16_params.add(output_name) + fp16_to_fp32[output_name] = input_name + param_usage[input_name] -= 1 + self.param_to_rank[output_name] = self.param_to_rank[input_name] + + for param, usage in param_usage.items(): + if usage > 0: + broadcast_vars.add(param) + return broadcast_vars, param_usage diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 8fa06adba27ba..53f4a93f6480e 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -31,7 +31,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.data_feeder import check_variable_and_dtype - +from paddle import _C_ops __all__ = [ #noqa 'get_host_name_ip', @@ -146,7 +146,7 @@ def global_scatter(x, ring_id = 0 if group is None else group.id if in_dygraph_mode(): - return core.ops.global_scatter(x, local_count, \ + return _C_ops.global_scatter(x, local_count, \ global_count, \ 'use_calc_stream', use_calc_stream, \ 'ring_id', ring_id) @@ -258,7 +258,7 @@ def global_gather(x, ring_id = 0 if group is None else group.id if in_dygraph_mode(): - return core.ops.global_gather(x, local_count, \ + return _C_ops.global_gather(x, local_count, \ global_count, \ 'use_calc_stream', use_calc_stream, \ 'ring_id', ring_id) diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py deleted file mode 100644 index cf198eab1e8e0..0000000000000 --- a/python/paddle/distribution.py +++ /dev/null @@ -1,968 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
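The greedy placement performed by shard_parameters/ShardingInfo in the sharding pass above can be seen in a small standalone sketch; the (name, shape) tuples and their sizes are made-up stand-ins for real Parameter objects:

    from functools import reduce

    def greedy_shard(param_shapes, group_size):
        # Mirror of shard_parameters: always give the next tensor to the rank
        # that currently holds the fewest elements.
        mapping = {rank: [] for rank in range(group_size)}
        sizes = [0] * group_size
        for name, shape in param_shapes:
            rank = sizes.index(min(sizes))
            numel = reduce(lambda x, y: x * y, shape)
            mapping[rank].append(name)
            sizes[rank] += numel
        return mapping, sizes

    params = [("embedding.w", (50304, 1024)), ("fc1.w", (1024, 4096)),
              ("fc2.w", (4096, 1024)), ("fc1.b", (4096,)), ("fc2.b", (1024,))]
    print(greedy_shard(params, group_size=2))
    # ({0: ['embedding.w'], 1: ['fc1.w', 'fc2.w', 'fc1.b', 'fc2.b']},
    #  [51511296, 8393728])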
- -# TODO: define the distribution functions -# __all__ = ['Categorical', -# 'MultivariateNormalDiag', -# 'Normal', -# 'sampling_id', -# 'Uniform'] - -from __future__ import print_function - -from .fluid.layers import control_flow -from .fluid.layers import tensor -from .fluid.layers import ops -from .fluid.layers import nn -from .fluid.layers import elementwise_mul, elementwise_div, elementwise_add, elementwise_sub -from .fluid import core -from .fluid.framework import in_dygraph_mode -from .tensor import arange, gather_nd, concat, multinomial -import math -import numpy as np -import warnings - -from .fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype -from paddle import _C_ops - -__all__ = ['Distribution', 'Uniform', 'Normal', 'Categorical'] - - -class Distribution(object): - """ - The abstract base class for probability distributions. Functions are - implemented in specific distributions. - """ - - def __init__(self): - super(Distribution, self).__init__() - - def sample(self): - """Sampling from the distribution.""" - raise NotImplementedError - - def entropy(self): - """The entropy of the distribution.""" - raise NotImplementedError - - def kl_divergence(self, other): - """The KL-divergence between self distributions and other.""" - raise NotImplementedError - - def log_prob(self, value): - """Log probability density/mass function.""" - raise NotImplementedError - - def probs(self, value): - """Probability density/mass function.""" - raise NotImplementedError - - def _validate_args(self, *args): - """ - Argument validation for distribution args - Args: - value (float, list, numpy.ndarray, Tensor) - Raises - ValueError: if one argument is Tensor, all arguments should be Tensor - """ - is_variable = False - is_number = False - for arg in args: - if isinstance(arg, tensor.Variable): - is_variable = True - else: - is_number = True - - if is_variable and is_number: - raise ValueError( - 'if one argument is Tensor, all arguments should be Tensor') - - return is_variable - - def _to_tensor(self, *args): - """ - Argument convert args to Tensor - - Args: - value (float, list, numpy.ndarray, Tensor) - Returns: - Tensor of args. - """ - numpy_args = [] - variable_args = [] - tmp = 0. - - for arg in args: - if isinstance(arg, float): - arg = [arg] - if not isinstance(arg, (list, tuple, np.ndarray, tensor.Variable)): - raise TypeError( - "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}". - format(type(arg))) - - arg_np = np.array(arg) - arg_dtype = arg_np.dtype - if str(arg_dtype) != 'float32': - if str(arg_dtype) != 'float64': - # "assign" op doesn't support float64. if dtype is float64, float32 variable will be generated - # and converted to float64 later using "cast". - warnings.warn( - "data type of argument only support float32 and float64, your argument will be convert to float32." - ) - arg_np = arg_np.astype('float32') - # tmp is used to support broadcast, it summarizes shapes of all the args and get the mixed shape. 
- tmp = tmp + arg_np - numpy_args.append(arg_np) - - dtype = tmp.dtype - for arg in numpy_args: - arg_broadcasted, _ = np.broadcast_arrays(arg, tmp) - arg_variable = tensor.create_tensor(dtype=dtype) - tensor.assign(arg_broadcasted, arg_variable) - variable_args.append(arg_variable) - - return tuple(variable_args) - - def _check_values_dtype_in_probs(self, param, value): - """ - Log_prob and probs methods have input ``value``, if value's dtype is different from param, - convert value's dtype to be consistent with param's dtype. - - Args: - param (Tensor): low and high in Uniform class, loc and scale in Normal class. - value (Tensor): The input tensor. - - Returns: - value (Tensor): Change value's dtype if value's dtype is different from param. - """ - if in_dygraph_mode(): - if value.dtype != param.dtype and convert_dtype( - value.dtype) in ['float32', 'float64']: - warnings.warn( - "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." - ) - return _C_ops.cast(value, 'in_dtype', value.dtype, 'out_dtype', - param.dtype) - return value - - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - if value.dtype != param.dtype: - warnings.warn( - "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." - ) - return tensor.cast(value, dtype=param.dtype) - return value - - -class Uniform(Distribution): - r"""Uniform distribution with `low` and `high` parameters. - - Mathematical Details - - The probability density function (pdf) is - - .. math:: - - pdf(x; a, b) = \\frac{1}{Z}, \ a <=x 1: - sample_shape = shape + logits_shape[:-1] - logits = nn.reshape(self.logits, - [np.prod(logits_shape[:-1]), logits_shape[-1]]) - else: - sample_shape = shape - logits = self.logits - - sample_index = multinomial(logits, num_samples, True) - return nn.reshape(sample_index, sample_shape, name=name) - - def kl_divergence(self, other): - """The KL-divergence between two Categorical distributions. - - Args: - other (Categorical): instance of Categorical. The data type is float32. - - Returns: - Tensor: kl-divergence between two Categorical distributions. - - Examples: - .. code-block:: python - - import paddle - from paddle.distribution import Categorical - - paddle.seed(100) # on CPU device - x = paddle.rand([6]) - print(x) - # [0.5535528 0.20714243 0.01162981 - # 0.51577556 0.36369765 0.2609165 ] - - paddle.seed(200) # on CPU device - y = paddle.rand([6]) - print(y) - # [0.77663314 0.90824795 0.15685187 - # 0.04279523 0.34468332 0.7955718 ] - - cat = Categorical(x) - cat2 = Categorical(y) - - cat.kl_divergence(cat2) - # [0.071952] - - """ - name = self.name + '_kl_divergence' - if not in_dygraph_mode(): - check_type(other, 'other', Categorical, 'kl_divergence') - - logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) - other_logits = other.logits - nn.reduce_max( - other.logits, dim=-1, keep_dim=True) - e_logits = ops.exp(logits) - other_e_logits = ops.exp(other_logits) - z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) - other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True) - prob = e_logits / z - kl = nn.reduce_sum( - prob * (logits - nn.log(z) - other_logits + nn.log(other_z)), - dim=-1, - keep_dim=True, - name=name) - - return kl - - def entropy(self): - """Shannon entropy in nats. - - Returns: - Tensor: Shannon entropy of Categorical distribution. The data type is float32. - - Examples: - .. 
code-block:: python - - import paddle - from paddle.distribution import Categorical - - paddle.seed(100) # on CPU device - x = paddle.rand([6]) - print(x) - # [0.5535528 0.20714243 0.01162981 - # 0.51577556 0.36369765 0.2609165 ] - - cat = Categorical(x) - - cat.entropy() - # [1.77528] - - """ - name = self.name + '_entropy' - logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) - e_logits = ops.exp(logits) - z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) - prob = e_logits / z - - neg_entropy = nn.reduce_sum( - prob * (logits - nn.log(z)), dim=-1, keep_dim=True) - entropy = nn.scale(neg_entropy, scale=-1.0, name=name) - return entropy - - def probs(self, value): - """Probabilities of the given category (``value``). - - If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as - category, and the others represents the different distributions. - At the same time, if ``vlaue`` is 1-D Tensor, ``value`` will be broadcast to the - same number of distributions as ``logits``. - If ``value`` is not 1-D Tensor, ``value`` should have the same number distributions - with ``logits. That is, ``value[:-1] = logits[:-1]``. - - Args: - value (Tensor): The input tensor represents the selected category index. - - Returns: - Tensor: probability according to the category index. - - Examples: - .. code-block:: python - - import paddle - from paddle.distribution import Categorical - - paddle.seed(100) # on CPU device - x = paddle.rand([6]) - print(x) - # [0.5535528 0.20714243 0.01162981 - # 0.51577556 0.36369765 0.2609165 ] - - cat = Categorical(x) - - value = paddle.to_tensor([2,1,3]) - cat.probs(value) - # [0.00608027 0.108298 0.269656] - - """ - name = self.name + '_probs' - - dist_sum = nn.reduce_sum(self.logits, dim=-1, keep_dim=True) - prob = self.logits / dist_sum - - shape = list(prob.shape) - value_shape = list(value.shape) - if len(shape) == 1: - num_value_in_one_dist = np.prod(value_shape) - index_value = nn.reshape(value, [num_value_in_one_dist, 1]) - index = index_value - else: - num_dist = np.prod(shape[:-1]) - num_value_in_one_dist = value_shape[-1] - prob = nn.reshape(prob, [num_dist, shape[-1]]) - if len(value_shape) == 1: - value = nn.expand(value, [num_dist]) - value_shape = shape[:-1] + value_shape - index_value = nn.reshape(value, [num_dist, -1, 1]) - if shape[:-1] != value_shape[:-1]: - raise ValueError( - "shape of value {} must match shape of logits {}".format( - str(value_shape[:-1]), str(shape[:-1]))) - - index_prefix = nn.unsqueeze( - arange( - num_dist, dtype=index_value.dtype), axes=-1) - index_prefix = nn.expand(index_prefix, [1, num_value_in_one_dist]) - index_prefix = nn.unsqueeze(index_prefix, axes=-1) - - if index_value.dtype != index_prefix.dtype: - tensor.cast(index_prefix, dtype=index_value.dtype) - index = concat([index_prefix, index_value], axis=-1) - - # value is the category index to search for the corresponding probability. - select_prob = gather_nd(prob, index) - return nn.reshape(select_prob, value_shape, name=name) - - def log_prob(self, value): - """Log probabilities of the given category. Refer to ``probs`` method. - - Args: - value (Tensor): The input tensor represents the selected category index. - - Returns: - Tensor: Log probability. - - Examples: - .. 
code-block:: python - - import paddle - from paddle.distribution import Categorical - - paddle.seed(100) # on CPU device - x = paddle.rand([6]) - print(x) - # [0.5535528 0.20714243 0.01162981 - # 0.51577556 0.36369765 0.2609165 ] - - cat = Categorical(x) - - value = paddle.to_tensor([2,1,3]) - cat.log_prob(value) - # [-5.10271 -2.22287 -1.31061] - - """ - name = self.name + '_log_prob' - - return nn.log(self.probs(value), name=name) diff --git a/python/paddle/fluid/eager/eager_tensor_patch_methods.py b/python/paddle/distribution/__init__.py similarity index 50% rename from python/paddle/fluid/eager/eager_tensor_patch_methods.py rename to python/paddle/distribution/__init__.py index 547a948da402b..6bc9048a1dd48 100644 --- a/python/paddle/fluid/eager/eager_tensor_patch_methods.py +++ b/python/paddle/distribution/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .. import core as core +from .beta import Beta +from .categorical import Categorical +from .dirichlet import Dirichlet +from .distribution import Distribution +from .exponential_family import ExponentialFamily +from .kl import kl_divergence, register_kl +from .normal import Normal +from .uniform import Uniform - -def monkey_patch_eagertensor(): - def __str__(self): - from paddle.tensor.to_string import eager_tensor_to_string - return eager_tensor_to_string(self) - - if hasattr(core, "eager"): - setattr(core.eager.EagerTensor, "__str__", __str__) +__all__ = [ # noqa + 'Beta', + 'Categorical', + 'Dirichlet', + 'Distribution', + 'ExponentialFamily', + 'Normal', + 'Uniform', + 'kl_divergence', + 'register_kl' +] diff --git a/python/paddle/distribution/beta.py b/python/paddle/distribution/beta.py new file mode 100644 index 0000000000000..057dff2866b91 --- /dev/null +++ b/python/paddle/distribution/beta.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numbers + +import paddle + +from .dirichlet import Dirichlet +from .exponential_family import ExponentialFamily + + +class Beta(ExponentialFamily): + r""" + Beta distribution parameterized by alpha and beta + + The probability density function (pdf) is + + .. math:: + + f(x; \alpha, \beta) = \frac{1}{B(\alpha, \beta)}x^{\alpha-1}(1-x)^{\beta-1} + + where the normalization, B, is the beta function, + + .. math:: + + B(\alpha, \beta) = \int_{0}^{1} t^{\alpha - 1} (1-t)^{\beta - 1}\mathrm{d}t + + + Args: + alpha (float|Tensor): alpha parameter of beta distribution, positive(>0). + beta (float|Tensor): beta parameter of beta distribution, positive(>0). + + Examples: + + .. 
code-block:: python + + import paddle + + # scale input + beta = paddle.distribution.Beta(alpha=0.5, beta=0.5) + print(beta.mean) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [0.50000000]) + print(beta.variance) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [0.12500000]) + print(beta.entropy()) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [0.12500000]) + + # tensor input with broadcast + beta = paddle.distribution.Beta(alpha=paddle.to_tensor([0.2, 0.4]), beta=0.6) + print(beta.mean) + # Tensor(shape=[2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [0.25000000, 0.40000001]) + print(beta.variance) + # Tensor(shape=[2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [0.10416666, 0.12000000]) + print(beta.entropy()) + # Tensor(shape=[2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [-1.91923141, -0.38095069]) + """ + + def __init__(self, alpha, beta): + if isinstance(alpha, numbers.Real): + alpha = paddle.full(shape=[1], fill_value=alpha) + + if isinstance(beta, numbers.Real): + beta = paddle.full(shape=[1], fill_value=beta) + + self.alpha, self.beta = paddle.broadcast_tensors([alpha, beta]) + + self._dirichlet = Dirichlet(paddle.stack([self.alpha, self.beta], -1)) + + super(Beta, self).__init__(self._dirichlet._batch_shape) + + @property + def mean(self): + """mean of beta distribution. + """ + return self.alpha / (self.alpha + self.beta) + + @property + def variance(self): + """variance of beat distribution + """ + sum = self.alpha + self.beta + return self.alpha * self.beta / (sum.pow(2) * (sum + 1)) + + def prob(self, value): + """probability density funciotn evaluated at value + + Args: + value (Tensor): value to be evaluated. + + Returns: + Tensor: probability. + """ + return paddle.exp(self.log_prob(value)) + + def log_prob(self, value): + """log probability density funciton evaluated at value + + Args: + value (Tensor): value to be evaluated + + Returns: + Tensor: log probability. + """ + return self._dirichlet.log_prob(paddle.stack([value, 1.0 - value], -1)) + + def sample(self, shape=()): + """sample from beta distribution with sample shape. + + Args: + shape (Sequence[int], optional): sample shape. + + Returns: + sampled data with shape `sample_shape` + `batch_shape` + `event_shape`. + """ + shape = shape if isinstance(shape, tuple) else tuple(shape) + return paddle.squeeze(self._dirichlet.sample(shape)[..., 0]) + + def entropy(self): + """entropy of dirichlet distribution + + Returns: + Tensor: entropy. + """ + return self._dirichlet.entropy() + + @property + def _natural_parameters(self): + return (self.alpha, self.beta) + + def _log_normalizer(self, x, y): + return paddle.lgamma(x) + paddle.lgamma(y) - paddle.lgamma(x + y) diff --git a/python/paddle/distribution/categorical.py b/python/paddle/distribution/categorical.py new file mode 100644 index 0000000000000..151e060e29b69 --- /dev/null +++ b/python/paddle/distribution/categorical.py @@ -0,0 +1,357 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings + +import numpy as np +from paddle import _C_ops + +from ..fluid import core +from ..fluid.data_feeder import (check_dtype, check_type, + check_variable_and_dtype, convert_dtype) +from ..fluid.framework import in_dygraph_mode +from ..fluid.layers import (control_flow, elementwise_add, elementwise_div, + elementwise_mul, elementwise_sub, nn, ops, tensor) +from ..tensor import arange, concat, gather_nd, multinomial +from .distribution import Distribution + + +class Categorical(Distribution): + r""" + Categorical distribution is a discrete probability distribution that + describes the possible results of a random variable that can take on + one of K possible categories, with the probability of each category + separately specified. + + The probability mass function (pmf) is: + + .. math:: + + pmf(k; p_i) = \prod_{i=1}^{k} p_i^{[x=i]} + + In the above equation: + + * :math:`[x=i]` : it evaluates to 1 if :math:`x==i` , 0 otherwise. + + Args: + logits(list|tuple|numpy.ndarray|Tensor): The logits input of categorical distribution. The data type is float32 or float64. + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + paddle.seed(100) # on CPU device + x = paddle.rand([6]) + print(x) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] + + paddle.seed(200) # on CPU device + y = paddle.rand([6]) + print(y) + # [0.77663314 0.90824795 0.15685187 + # 0.04279523 0.34468332 0.7955718 ] + + cat = Categorical(x) + cat2 = Categorical(y) + + paddle.seed(1000) # on CPU device + cat.sample([2,3]) + # [[0, 0, 5], + # [3, 4, 5]] + + cat.entropy() + # [1.77528] + + cat.kl_divergence(cat2) + # [0.071952] + + value = paddle.to_tensor([2,1,3]) + cat.probs(value) + # [0.00608027 0.108298 0.269656] + + cat.log_prob(value) + # [-5.10271 -2.22287 -1.31061] + + """ + + def __init__(self, logits, name=None): + """ + Args: + logits(list|tuple|numpy.ndarray|Tensor): The logits input of categorical distribution. The data type is float32 or float64. + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + """ + if not in_dygraph_mode(): + check_type(logits, 'logits', + (np.ndarray, tensor.Variable, list, tuple), + 'Categorical') + + self.name = name if name is not None else 'Categorical' + self.dtype = 'float32' + + if self._validate_args(logits): + self.logits = logits + self.dtype = convert_dtype(logits.dtype) + else: + if isinstance(logits, np.ndarray) and str( + logits.dtype) in ['float32', 'float64']: + self.dtype = logits.dtype + self.logits = self._to_tensor(logits)[0] + if self.dtype != convert_dtype(self.logits.dtype): + self.logits = tensor.cast(self.logits, dtype=self.dtype) + + def sample(self, shape): + """Generate samples of the specified shape. + + Args: + shape (list): Shape of the generated samples. + + Returns: + Tensor: A tensor with prepended dimensions shape. 
+ + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + paddle.seed(100) # on CPU device + x = paddle.rand([6]) + print(x) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] + + cat = Categorical(x) + + paddle.seed(1000) # on CPU device + cat.sample([2,3]) + # [[0, 0, 5], + # [3, 4, 5]] + + """ + name = self.name + '_sample' + if not in_dygraph_mode(): + check_type(shape, 'shape', (list), 'sample') + + num_samples = np.prod(np.array(shape)) + + logits_shape = list(self.logits.shape) + if len(logits_shape) > 1: + sample_shape = shape + logits_shape[:-1] + logits = nn.reshape(self.logits, + [np.prod(logits_shape[:-1]), logits_shape[-1]]) + else: + sample_shape = shape + logits = self.logits + + sample_index = multinomial(logits, num_samples, True) + return nn.reshape(sample_index, sample_shape, name=name) + + def kl_divergence(self, other): + """The KL-divergence between two Categorical distributions. + + Args: + other (Categorical): instance of Categorical. The data type is float32. + + Returns: + Tensor: kl-divergence between two Categorical distributions. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + paddle.seed(100) # on CPU device + x = paddle.rand([6]) + print(x) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] + + paddle.seed(200) # on CPU device + y = paddle.rand([6]) + print(y) + # [0.77663314 0.90824795 0.15685187 + # 0.04279523 0.34468332 0.7955718 ] + + cat = Categorical(x) + cat2 = Categorical(y) + + cat.kl_divergence(cat2) + # [0.071952] + + """ + name = self.name + '_kl_divergence' + if not in_dygraph_mode(): + check_type(other, 'other', Categorical, 'kl_divergence') + + logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) + other_logits = other.logits - nn.reduce_max( + other.logits, dim=-1, keep_dim=True) + e_logits = ops.exp(logits) + other_e_logits = ops.exp(other_logits) + z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) + other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True) + prob = e_logits / z + kl = nn.reduce_sum( + prob * (logits - nn.log(z) - other_logits + nn.log(other_z)), + dim=-1, + keep_dim=True, + name=name) + + return kl + + def entropy(self): + """Shannon entropy in nats. + + Returns: + Tensor: Shannon entropy of Categorical distribution. The data type is float32. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + paddle.seed(100) # on CPU device + x = paddle.rand([6]) + print(x) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] + + cat = Categorical(x) + + cat.entropy() + # [1.77528] + + """ + name = self.name + '_entropy' + logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) + e_logits = ops.exp(logits) + z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) + prob = e_logits / z + + neg_entropy = nn.reduce_sum( + prob * (logits - nn.log(z)), dim=-1, keep_dim=True) + entropy = nn.scale(neg_entropy, scale=-1.0, name=name) + return entropy + + def probs(self, value): + """Probabilities of the given category (``value``). + + If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as + category, and the others represents the different distributions. + At the same time, if ``vlaue`` is 1-D Tensor, ``value`` will be broadcast to the + same number of distributions as ``logits``. 
+ If ``value`` is not a 1-D Tensor, ``value`` should cover the same number of distributions + as ``logits``. That is, ``value.shape[:-1]`` must match ``logits.shape[:-1]``. + + Args: + value (Tensor): The input tensor represents the selected category index. + + Returns: + Tensor: probability according to the category index. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + paddle.seed(100) # on CPU device + x = paddle.rand([6]) + print(x) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] + + cat = Categorical(x) + + value = paddle.to_tensor([2,1,3]) + cat.probs(value) + # [0.00608027 0.108298 0.269656] + + """ + name = self.name + '_probs' + + dist_sum = nn.reduce_sum(self.logits, dim=-1, keep_dim=True) + prob = self.logits / dist_sum + + shape = list(prob.shape) + value_shape = list(value.shape) + if len(shape) == 1: + num_value_in_one_dist = np.prod(value_shape) + index_value = nn.reshape(value, [num_value_in_one_dist, 1]) + index = index_value + else: + num_dist = np.prod(shape[:-1]) + num_value_in_one_dist = value_shape[-1] + prob = nn.reshape(prob, [num_dist, shape[-1]]) + if len(value_shape) == 1: + value = nn.expand(value, [num_dist]) + value_shape = shape[:-1] + value_shape + index_value = nn.reshape(value, [num_dist, -1, 1]) + if shape[:-1] != value_shape[:-1]: + raise ValueError( + "shape of value {} must match shape of logits {}".format( + str(value_shape[:-1]), str(shape[:-1]))) + + index_prefix = nn.unsqueeze( + arange( + num_dist, dtype=index_value.dtype), axes=-1) + index_prefix = nn.expand(index_prefix, [1, num_value_in_one_dist]) + index_prefix = nn.unsqueeze(index_prefix, axes=-1) + + if index_value.dtype != index_prefix.dtype: + index_prefix = tensor.cast(index_prefix, dtype=index_value.dtype) + index = concat([index_prefix, index_value], axis=-1) + + # value is the category index to search for the corresponding probability. + select_prob = gather_nd(prob, index) + return nn.reshape(select_prob, value_shape, name=name) + + def log_prob(self, value): + """Log probabilities of the given category. Refer to ``probs`` method. + + Args: + value (Tensor): The input tensor represents the selected category index. + + Returns: + Tensor: Log probability. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + paddle.seed(100) # on CPU device + x = paddle.rand([6]) + print(x) + # [0.5535528 0.20714243 0.01162981 + # 0.51577556 0.36369765 0.2609165 ] + + cat = Categorical(x) + + value = paddle.to_tensor([2,1,3]) + cat.log_prob(value) + # [-5.10271 -2.22287 -1.31061] + + """ + name = self.name + '_log_prob' + + return nn.log(self.probs(value), name=name) diff --git a/python/paddle/distribution/dirichlet.py b/python/paddle/distribution/dirichlet.py new file mode 100644 index 0000000000000..2ef38a5a52d2e --- /dev/null +++ b/python/paddle/distribution/dirichlet.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
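A minimal sketch (not part of the patch above) of the broadcasting behaviour that `probs` documents: the method treats `logits` as unnormalized probabilities, dividing by their sum rather than applying a softmax, and a 1-D `value` is repeated across every distribution in the batch. The logits and category indices below are arbitrary illustrative values, and the NumPy reference simply mirrors that sum-normalization; exact numerics depend on the Paddle build.

```python
# Illustrative only: 2 distributions with 3 categories each, and a 1-D value
# that is broadcast to both rows by Categorical.probs.
import numpy as np
import paddle
from paddle.distribution import Categorical

logits = paddle.to_tensor([[1.0, 2.0, 3.0],
                           [4.0, 5.0, 6.0]])   # shape [2, 3]
cat = Categorical(logits)

value = paddle.to_tensor([0, 2])               # 1-D value, shared by both rows
p = cat.probs(value)                           # shape [2, 2]

# Reference computed the same way the method does: normalize by the sum of
# each row, then pick the requested categories from every distribution.
ref = logits.numpy() / logits.numpy().sum(-1, keepdims=True)
expected = ref[:, [0, 2]]
print(p.numpy())
print(expected)                                # should agree with p
```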
+ +import paddle + +from ..fluid.data_feeder import check_variable_and_dtype +from ..fluid.framework import in_dygraph_mode +from ..fluid.layer_helper import LayerHelper +from .exponential_family import ExponentialFamily + + +class Dirichlet(ExponentialFamily): + r""" + Dirichlet distribution with parameter concentration + + The Dirichlet distribution is defined over the `(k-1)-simplex` using a + positive, length-k vector concentration (`k > 1`). + The Dirichlet is identically the Beta distribution when `k = 2`. + + The probability density function (pdf) is + + .. math:: + + f(x_1,...,x_k; \alpha_1,...,\alpha_k) = \frac{1}{B(\alpha)} \prod_{i=1}^{k}x_i^{\alpha_i-1} + + The normalizing constant is the multivariate beta function. + + Args: + concentration (Tensor): concentration parameter of dirichlet + distribution + + Examples: + + .. code-block:: python + + import paddle + + dirichlet = paddle.distribution.Dirichlet(paddle.to_tensor([1., 2., 3.])) + + print(dirichlet.entropy()) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [-1.24434423]) + print(dirichlet.prob(paddle.to_tensor([.3, .5, .6]))) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [10.80000114]) + + """ + + def __init__(self, concentration): + if concentration.dim() < 1: + raise ValueError( + "`concentration` parameter must be at least one dimensional") + + self.concentration = concentration + super(Dirichlet, self).__init__(concentration.shape[:-1], + concentration.shape[-1:]) + + @property + def mean(self): + """mean of Dirichlet distribution. + + Returns: + mean value of distribution. + """ + return self.concentration / self.concentration.sum(-1, keepdim=True) + + @property + def variance(self): + """variance of Dirichlet distribution. + + Returns: + variance value of distribution. + """ + concentration0 = self.concentration.sum(-1, keepdim=True) + return (self.concentration * (concentration0 - self.concentration)) / ( + concentration0.pow(2) * (concentration0 + 1)) + + def sample(self, shape=()): + """sample from dirichlet distribution. + + Args: + shape (Sequence[int], optional): sample shape. Defaults to empty tuple. + """ + shape = shape if isinstance(shape, tuple) else tuple(shape) + return _dirichlet(self.concentration.expand(self._extend_shape(shape))) + + def prob(self, value): + """Probability density function (pdf) evaluated at value. + + Args: + value (Tensor): value to be evaluated. + + Returns: + pdf evaluated at value. + """ + return paddle.exp(self.log_prob(value)) + + def log_prob(self, value): + """log of probability density function. + + Args: + value (Tensor): value to be evaluated. + """ + return ((paddle.log(value) * (self.concentration - 1.0) + ).sum(-1) + paddle.lgamma(self.concentration.sum(-1)) - + paddle.lgamma(self.concentration).sum(-1)) + + def entropy(self): + """entropy of Dirichlet distribution. + + Returns: + entropy of distribution.
+ """ + concentration0 = self.concentration.sum(-1) + k = self.concentration.shape[-1] + return (paddle.lgamma(self.concentration).sum(-1) - + paddle.lgamma(concentration0) - + (k - concentration0) * paddle.digamma(concentration0) - ( + (self.concentration - 1.0 + ) * paddle.digamma(self.concentration)).sum(-1)) + + @property + def _natural_parameters(self): + return (self.concentration, ) + + def _log_normalizer(self, x): + return x.lgamma().sum(-1) - paddle.lgamma(x.sum(-1)) + + +def _dirichlet(concentration, name=None): + op_type = 'dirichlet' + + check_variable_and_dtype(concentration, 'concentration', + ['float32', 'float64'], op_type) + + if in_dygraph_mode(): + return paddle._C_ops.dirichlet(concentration) + + else: + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference( + dtype=concentration.dtype) + helper.append_op( + type=op_type, + inputs={"Alpha": concentration}, + outputs={'Out': out}, + attrs={}) + return out diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py new file mode 100644 index 0000000000000..9ba35cc4d3df5 --- /dev/null +++ b/python/paddle/distribution/distribution.py @@ -0,0 +1,214 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: define the distribution functions +# __all__ = ['Categorical', +# 'MultivariateNormalDiag', +# 'Normal', +# 'sampling_id', +# 'Uniform'] + +from __future__ import print_function + +import math +import warnings + +import numpy as np +from paddle import _C_ops + +from ..fluid import core +from ..fluid.data_feeder import (check_dtype, check_type, + check_variable_and_dtype, convert_dtype) +from ..fluid.framework import in_dygraph_mode +from ..fluid.layers import (control_flow, elementwise_add, elementwise_div, + elementwise_mul, elementwise_sub, nn, ops, tensor) +from ..tensor import arange, concat, gather_nd, multinomial + + +class Distribution(object): + """ + The abstract base class for probability distributions. Functions are + implemented in specific distributions. + + Args: + batch_shape(Sequence[int], optional): independent, not identically + distributed draws, aka a "collection" or "bunch" of distributions. + event_shape(Sequence[int], optional): the shape of a single + draw from the distribution; it may be dependent across dimensions. + For scalar distributions, the event shape is []. For n-dimension + multivariate distribution, the event shape is [n]. 
+ """ + + def __init__(self, batch_shape=(), event_shape=()): + + self._batch_shape = batch_shape if isinstance( + batch_shape, tuple) else tuple(batch_shape) + self._event_shape = event_shape if isinstance( + event_shape, tuple) else tuple(event_shape) + + super(Distribution, self).__init__() + + @property + def batch_shape(self): + """Returns batch shape of distribution + + Returns: + Sequence[int]: batch shape + """ + return self._batch_shape + + @property + def event_shape(self): + """Returns event shape of distribution + + Returns: + Sequence[int]: event shape + """ + return self._event_shape + + def sample(self, shape=()): + """Sampling from the distribution.""" + raise NotImplementedError + + def entropy(self): + """The entropy of the distribution.""" + raise NotImplementedError + + def kl_divergence(self, other): + """The KL-divergence between self distributions and other.""" + raise NotImplementedError + + def prob(self, value): + """Probability density/mass function evaluated at value. + + Args: + value (Tensor): value which will be evaluated + """ + raise NotImplementedError + + def log_prob(self, value): + """Log probability density/mass function.""" + raise NotImplementedError + + def probs(self, value): + """Probability density/mass function.""" + raise NotImplementedError + + def _extend_shape(self, sample_shape): + """compute shape of the sample + + Args: + sample_shape (Tensor): sample shape + + Returns: + Tensor: generated sample data shape + """ + return sample_shape + self._batch_shape + self._event_shape + + def _validate_args(self, *args): + """ + Argument validation for distribution args + Args: + value (float, list, numpy.ndarray, Tensor) + Raises + ValueError: if one argument is Tensor, all arguments should be Tensor + """ + is_variable = False + is_number = False + for arg in args: + if isinstance(arg, tensor.Variable): + is_variable = True + else: + is_number = True + + if is_variable and is_number: + raise ValueError( + 'if one argument is Tensor, all arguments should be Tensor') + + return is_variable + + def _to_tensor(self, *args): + """ + Argument convert args to Tensor + + Args: + value (float, list, numpy.ndarray, Tensor) + Returns: + Tensor of args. + """ + numpy_args = [] + variable_args = [] + tmp = 0. + + for arg in args: + if isinstance(arg, float): + arg = [arg] + if not isinstance(arg, (list, tuple, np.ndarray, tensor.Variable)): + raise TypeError( + "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}". + format(type(arg))) + + arg_np = np.array(arg) + arg_dtype = arg_np.dtype + if str(arg_dtype) != 'float32': + if str(arg_dtype) != 'float64': + # "assign" op doesn't support float64. if dtype is float64, float32 variable will be generated + # and converted to float64 later using "cast". + warnings.warn( + "data type of argument only support float32 and float64, your argument will be convert to float32." + ) + arg_np = arg_np.astype('float32') + # tmp is used to support broadcast, it summarizes shapes of all the args and get the mixed shape. 
+ tmp = tmp + arg_np + numpy_args.append(arg_np) + + dtype = tmp.dtype + for arg in numpy_args: + arg_broadcasted, _ = np.broadcast_arrays(arg, tmp) + arg_variable = tensor.create_tensor(dtype=dtype) + tensor.assign(arg_broadcasted, arg_variable) + variable_args.append(arg_variable) + + return tuple(variable_args) + + def _check_values_dtype_in_probs(self, param, value): + """ + Log_prob and probs methods have input ``value``, if value's dtype is different from param, + convert value's dtype to be consistent with param's dtype. + + Args: + param (Tensor): low and high in Uniform class, loc and scale in Normal class. + value (Tensor): The input tensor. + + Returns: + value (Tensor): Change value's dtype if value's dtype is different from param. + """ + if in_dygraph_mode(): + if value.dtype != param.dtype and convert_dtype( + value.dtype) in ['float32', 'float64']: + warnings.warn( + "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." + ) + return _C_ops.cast(value, 'in_dtype', value.dtype, 'out_dtype', + param.dtype) + return value + + check_variable_and_dtype(value, 'value', ['float32', 'float64'], + 'log_prob') + if value.dtype != param.dtype: + warnings.warn( + "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." + ) + return tensor.cast(value, dtype=param.dtype) + return value diff --git a/python/paddle/distribution/exponential_family.py b/python/paddle/distribution/exponential_family.py new file mode 100644 index 0000000000000..0ce743efe85bd --- /dev/null +++ b/python/paddle/distribution/exponential_family.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +from ..fluid.framework import in_dygraph_mode +from .distribution import Distribution + + +class ExponentialFamily(Distribution): + r""" + ExponentialFamily is the base class for probability distributions belonging + to exponential family, whose probability mass/density function has the + form is defined below + + ExponentialFamily is derived from `paddle.distribution.Distribution`. + + .. math:: + + f_{F}(x; \theta) = \exp(\langle t(x), \theta\rangle - F(\theta) + k(x)) + + where :math:`\theta` denotes the natural parameters, :math:`t(x)` denotes + the sufficient statistic, :math:`F(\theta)` is the log normalizer function + for a given family and :math:`k(x)` is the carrier measure. 
+ """ + + @property + def _natural_parameters(self): + raise NotImplementedError + + def _log_normalizer(self): + raise NotImplementedError + + @property + def _mean_carrier_measure(self): + raise NotImplementedError + + def entropy(self): + """caculate entropy use `bregman divergence` + https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf + """ + entropy_value = -self._mean_carrier_measure + + natural_parameters = [] + for parameter in self._natural_parameters: + parameter = parameter.detach() + parameter.stop_gradient = False + natural_parameters.append(parameter) + + log_norm = self._log_normalizer(*natural_parameters) + + if in_dygraph_mode(): + grads = paddle.grad( + log_norm.sum(), natural_parameters, create_graph=True) + else: + grads = paddle.static.gradients(log_norm.sum(), natural_parameters) + + entropy_value += log_norm + for p, g in zip(natural_parameters, grads): + entropy_value -= p * g + + return entropy_value diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py new file mode 100644 index 0000000000000..ff6a8cde456cb --- /dev/null +++ b/python/paddle/distribution/kl.py @@ -0,0 +1,209 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools +import warnings + +import paddle + +from ..fluid.framework import in_dygraph_mode +from .beta import Beta +from .categorical import Categorical +from .dirichlet import Dirichlet +from .distribution import Distribution +from .exponential_family import ExponentialFamily +from .normal import Normal +from .uniform import Uniform + +__all__ = ["register_kl", "kl_divergence"] + +_REGISTER_TABLE = {} + + +def kl_divergence(p, q): + r""" + Kullback-Leibler divergence between distribution p and q. + + .. math:: + + KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x + + Args: + p (Distribution): ``Distribution`` object. + q (Distribution): ``Distribution`` object. + + Returns: + Tensor: batchwise KL-divergence between distribution p and q. + + Raises: + NotImplementedError: can't find register function for KL(p||Q). + + Examples: + + .. code-block:: python + + import paddle + + p = paddle.distribution.Beta(alpha=0.5, beta=0.5) + q = paddle.distribution.Beta(alpha=0.3, beta=0.7) + + print(paddle.distribution.kl_divergence(p, q)) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [0.21193528]) + + """ + return _dispatch(type(p), type(q))(p, q) + + +def register_kl(cls_p, cls_q): + """Decorator for register a KL divergence implemention function. + + Args: + cls_p(Distribution): subclass derived from ``Distribution``. + cls_q(Distribution): subclass derived from ``Distribution``. + + Examples: + .. 
code-block:: python + + import paddle + + @paddle.distribution.register_kl(paddle.distribution.Beta, paddle.distribution.Beta) + def kl_beta_beta(): + pass # insert implementation here + """ + if (not issubclass(cls_p, Distribution) or + not issubclass(cls_q, Distribution)): + raise TypeError('cls_p and cls_q must be subclass of Distribution') + + def decorator(f): + _REGISTER_TABLE[cls_p, cls_q] = f + return f + + return decorator + + +def _dispatch(cls_p, cls_q): + """multiple dispatch into concrete implement function""" + + # find all matched super class pair of p and q + matchs = [(super_p, super_q) for super_p, super_q in _REGISTER_TABLE + if issubclass(cls_p, super_p) and issubclass(cls_q, super_q)] + if not matchs: + raise NotImplementedError + + left_p, left_q = min(_Compare(*m) for m in matchs).classes + right_p, right_q = min(_Compare(*reversed(m)) for m in matchs).classes + + if _REGISTER_TABLE[left_p, left_q] is not _REGISTER_TABLE[right_p, right_q]: + warnings.warn( + 'Ambiguous kl_divergence({}, {}). Please register_kl({}, {})'. + format(cls_p.__name__, cls_q.__name__, left_p.__name__, + right_q.__name__), RuntimeWarning) + + return _REGISTER_TABLE[left_p, left_q] + + +@functools.total_ordering +class _Compare(object): + def __init__(self, *classes): + self.classes = classes + + def __eq__(self, other): + return self.classes == other.classes + + def __le__(self, other): + for cls_x, cls_y in zip(self.classes, other.classes): + if not issubclass(cls_x, cls_y): + return False + if cls_x is not cls_y: + break + return True + + +@register_kl(Beta, Beta) +def _kl_beta_beta(p, q): + return ((q.alpha.lgamma() + q.beta.lgamma() + (p.alpha + p.beta).lgamma()) - + (p.alpha.lgamma() + p.beta.lgamma() + (q.alpha + q.beta).lgamma()) + + ((p.alpha - q.alpha) * p.alpha.digamma()) + ( + (p.beta - q.beta) * p.beta.digamma()) + ( + ((q.alpha + q.beta) - + (p.alpha + p.beta)) * (p.alpha + p.beta).digamma())) + + +@register_kl(Dirichlet, Dirichlet) +def _kl_dirichlet_dirichlet(p, q): + return ( + (p.concentration.sum(-1).lgamma() - q.concentration.sum(-1).lgamma()) - + ((p.concentration.lgamma() - q.concentration.lgamma()).sum(-1)) + ( + ((p.concentration - q.concentration) * + (p.concentration.digamma() - + p.concentration.sum(-1).digamma().unsqueeze(-1))).sum(-1))) + + +@register_kl(Categorical, Categorical) +def _kl_categorical_categorical(p, q): + return p.kl_divergence(q) + + +@register_kl(Normal, Normal) +def _kl_normal_normal(p, q): + return p.kl_divergence(q) + + +@register_kl(Uniform, Uniform) +def _kl_uniform_uniform(p, q): + return p.kl_divergence(q) + + +@register_kl(ExponentialFamily, ExponentialFamily) +def _kl_expfamily_expfamily(p, q): + """compute kl-divergence using `Bregman divergences` + https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf + """ + if not type(p) == type(q): + raise NotImplementedError + + p_natural_params = [] + for param in p._natural_parameters: + param = param.detach() + param.stop_gradient = False + p_natural_params.append(param) + + q_natural_params = q._natural_parameters + + p_log_norm = p._log_normalizer(*p_natural_params) + + try: + if in_dygraph_mode(): + p_grads = paddle.grad( + p_log_norm, p_natural_params, create_graph=True) + else: + p_grads = paddle.static.gradients(p_log_norm, p_natural_params) + except RuntimeError as e: + raise TypeError( + "Cann't compute kl_divergence({cls_p}, {cls_q}) use bregman divergence. Please register_kl({cls_p}, {cls_q}).". 
+ format( + cls_p=type(p).__name__, cls_q=type(q).__name__)) from e + + kl = q._log_normalizer(*q_natural_params) - p_log_norm + for p_param, q_param, p_grad in zip(p_natural_params, q_natural_params, + p_grads): + term = (q_param - p_param) * p_grad + kl -= _sum_rightmost(term, len(q.event_shape)) + + return kl + + +def _sum_rightmost(value, n): + """sum value along rightmost n dim""" + return value.sum(list(range(-n, 0))) if n > 0 else value diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py new file mode 100644 index 0000000000000..95d544577dfca --- /dev/null +++ b/python/paddle/distribution/normal.py @@ -0,0 +1,279 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings + +import numpy as np +from paddle import _C_ops + +from ..fluid import core +from ..fluid.data_feeder import (check_dtype, check_type, + check_variable_and_dtype, convert_dtype) +from ..fluid.framework import in_dygraph_mode +from ..fluid.layers import (control_flow, elementwise_add, elementwise_div, + elementwise_mul, elementwise_sub, nn, ops, tensor) +from ..tensor import arange, concat, gather_nd, multinomial +from .distribution import Distribution + + +class Normal(Distribution): + r"""The Normal distribution with location `loc` and `scale` parameters. + + Mathematical details + + The probability density function (pdf) is + + .. math:: + + pdf(x; \mu, \sigma) = \\frac{1}{Z}e^{\\frac {-0.5 (x - \mu)^2} {\sigma^2} } + + .. math:: + + Z = (2 \pi \sigma^2)^{0.5} + + In the above equation: + + * :math:`loc = \mu`: is the mean. + * :math:`scale = \sigma`: is the std. + * :math:`Z`: is the normalization constant. + + Args: + loc(int|float|list|tuple|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. + scale(int|float|list|tuple|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Normal + + # Define a single scalar Normal distribution. + dist = Normal(loc=0., scale=3.) + # Define a batch of two scalar valued Normals. + # The first has mean 1 and standard deviation 11, the second 2 and 22. + dist = Normal(loc=[1., 2.], scale=[11., 22.]) + # Get 3 samples, returning a 3 x 2 tensor. + dist.sample([3]) + + # Define a batch of two scalar valued Normals. + # Both have mean 1, but different standard deviations. 
+ dist = Normal(loc=1., scale=[11., 22.]) + + # Complete example + value_tensor = paddle.to_tensor([0.8], dtype="float32") + + normal_a = Normal([0.], [1.]) + normal_b = Normal([0.5], [2.]) + sample = normal_a.sample([2]) + # a random tensor created by normal distribution with shape: [2, 1] + entropy = normal_a.entropy() + # [1.4189385] with shape: [1] + lp = normal_a.log_prob(value_tensor) + # [-1.2389386] with shape: [1] + p = normal_a.probs(value_tensor) + # [0.28969154] with shape: [1] + kl = normal_a.kl_divergence(normal_b) + # [0.34939718] with shape: [1] + """ + + def __init__(self, loc, scale, name=None): + if not in_dygraph_mode(): + check_type(loc, 'loc', + (int, float, np.ndarray, tensor.Variable, list, tuple), + 'Normal') + check_type(scale, 'scale', + (int, float, np.ndarray, tensor.Variable, list, tuple), + 'Normal') + + self.batch_size_unknown = False + self.all_arg_is_float = False + self.name = name if name is not None else 'Normal' + self.dtype = 'float32' + + if isinstance(loc, int): + loc = float(loc) + if isinstance(scale, int): + scale = float(scale) + + if self._validate_args(loc, scale): + self.batch_size_unknown = True + self.loc = loc + self.scale = scale + self.dtype = convert_dtype(loc.dtype) + else: + if isinstance(loc, float) and isinstance(scale, float): + self.all_arg_is_float = True + if isinstance( + loc, + np.ndarray) and str(loc.dtype) in ['float32', 'float64']: + self.dtype = loc.dtype + elif isinstance( + scale, + np.ndarray) and str(scale.dtype) in ['float32', 'float64']: + self.dtype = scale.dtype + # pylint: disable=unbalanced-tuple-unpacking + self.loc, self.scale = self._to_tensor(loc, scale) + if self.dtype != convert_dtype(self.loc.dtype): + self.loc = tensor.cast(self.loc, dtype=self.dtype) + self.scale = tensor.cast(self.scale, dtype=self.dtype) + + def sample(self, shape, seed=0): + """Generate samples of the specified shape. + + Args: + shape (list): 1D `int32`. Shape of the generated samples. + seed (int): Python integer number. + + Returns: + Tensor: A tensor with prepended dimensions shape.The data type is float32. + + """ + if not in_dygraph_mode(): + check_type(shape, 'shape', (list), 'sample') + check_type(seed, 'seed', (int), 'sample') + + batch_shape = list((self.loc + self.scale).shape) + name = self.name + '_sample' + + if self.batch_size_unknown: + output_shape = shape + batch_shape + zero_tmp = tensor.fill_constant_batch_size_like( + self.loc + self.scale, batch_shape + shape, self.dtype, 0.) + zero_tmp_reshape = nn.reshape(zero_tmp, output_shape) + zero_tmp_shape = nn.shape(zero_tmp_reshape) + normal_random_tmp = nn.gaussian_random( + zero_tmp_shape, mean=0., std=1., seed=seed, dtype=self.dtype) + output = normal_random_tmp * (zero_tmp_reshape + self.scale) + output = elementwise_add(output, self.loc, name=name) + return output + else: + output_shape = shape + batch_shape + output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed, dtype=self.dtype) * \ + (tensor.zeros(output_shape, dtype=self.dtype) + self.scale) + output = elementwise_add(output, self.loc, name=name) + if self.all_arg_is_float: + return nn.reshape(output, shape, name=name) + else: + return output + + def entropy(self): + r"""Shannon entropy in nats. + + The entropy is + + .. math:: + + entropy(\sigma) = 0.5 \\log (2 \pi e \sigma^2) + + In the above equation: + + * :math:`scale = \sigma`: is the std. + + Returns: + Tensor: Shannon entropy of normal distribution.The data type is float32. 
+ + """ + name = self.name + '_entropy' + batch_shape = list((self.loc + self.scale).shape) + zero_tmp = tensor.fill_constant_batch_size_like( + self.loc + self.scale, batch_shape, self.dtype, 0.) + return elementwise_add( + 0.5 + zero_tmp, + 0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)), + name=name) + + def log_prob(self, value): + """Log probability density/mass function. + + Args: + value (Tensor): The input tensor. + + Returns: + Tensor: log probability.The data type is same with value. + + """ + name = self.name + '_log_prob' + value = self._check_values_dtype_in_probs(self.loc, value) + + var = self.scale * self.scale + log_scale = nn.log(self.scale) + return elementwise_sub( + -1. * ((value - self.loc) * (value - self.loc)) / (2. * var), + log_scale + math.log(math.sqrt(2. * math.pi)), + name=name) + + def probs(self, value): + """Probability density/mass function. + + Args: + value (Tensor): The input tensor. + + Returns: + Tensor: probability.The data type is same with value. + + """ + name = self.name + '_probs' + value = self._check_values_dtype_in_probs(self.loc, value) + + var = self.scale * self.scale + return elementwise_div( + ops.exp(-1. * ((value - self.loc) * (value - self.loc)) / + (2. * var)), (math.sqrt(2 * math.pi) * self.scale), + name=name) + + def kl_divergence(self, other): + r"""The KL-divergence between two normal distributions. + + The probability density function (pdf) is + + .. math:: + + KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\\frac{diff}{\sigma_1})^2 - 1 - 2 \\ln {ratio}) + + .. math:: + + ratio = \\frac{\sigma_0}{\sigma_1} + + .. math:: + + diff = \mu_1 - \mu_0 + + In the above equation: + + * :math:`loc = \mu_0`: is the mean of current Normal distribution. + * :math:`scale = \sigma_0`: is the std of current Normal distribution. + * :math:`loc = \mu_1`: is the mean of other Normal distribution. + * :math:`scale = \sigma_1`: is the std of other Normal distribution. + * :math:`ratio`: is the ratio of scales. + * :math:`diff`: is the difference between means. + + Args: + other (Normal): instance of Normal. + + Returns: + Tensor: kl-divergence between two normal distributions.The data type is float32. + + """ + if not in_dygraph_mode(): + check_type(other, 'other', Normal, 'kl_divergence') + + name = self.name + '_kl_divergence' + var_ratio = self.scale / other.scale + var_ratio = (var_ratio * var_ratio) + t1 = (self.loc - other.loc) / other.scale + t1 = (t1 * t1) + return elementwise_add( + 0.5 * var_ratio, 0.5 * (t1 - 1. - nn.log(var_ratio)), name=name) diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py new file mode 100644 index 0000000000000..516071dafe62f --- /dev/null +++ b/python/paddle/distribution/uniform.py @@ -0,0 +1,252 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import warnings + +import numpy as np +from paddle import _C_ops + +from ..fluid import core +from ..fluid.data_feeder import (check_dtype, check_type, + check_variable_and_dtype, convert_dtype) +from ..fluid.framework import in_dygraph_mode +from ..fluid.layers import (control_flow, elementwise_add, elementwise_div, + elementwise_mul, elementwise_sub, nn, ops, tensor) +from ..tensor import arange, concat, gather_nd, multinomial +from .distribution import Distribution + + +class Uniform(Distribution): + r"""Uniform distribution with `low` and `high` parameters. + + Mathematical Details + + The probability density function (pdf) is + + .. math:: + + pdf(x; a, b) = \\frac{1}{Z}, \ a <=x 0 else "float32" global_norm_var = [] if len(sum_square_list_fp16) > 0: - global_norm_var_fp16 = layers.concat(sum_square_list_fp16) - global_norm_var_fp16 = layers.reduce_sum(global_norm_var_fp16) + global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16) global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) if len(sum_square_list_fp32) > 0: - global_norm_var_fp32 = layers.concat(sum_square_list_fp32) - global_norm_var_fp32 = layers.reduce_sum(global_norm_var_fp32) + global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32) if sum_dtype == 'float32': global_norm_var.append(global_norm_var_fp32) else: global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) if len(sum_square_list) > 0: - global_norm_var_fp64 = layers.concat(sum_square_list) - global_norm_var_fp64 = layers.reduce_sum(global_norm_var_fp64) + global_norm_var_fp64 = paddle.add_n(sum_square_list) global_norm_var.append(global_norm_var_fp64) - global_norm_var = layers.concat(global_norm_var) - global_norm_var = layers.reduce_sum(global_norm_var) + global_norm_var = paddle.add_n(global_norm_var) global_norm_var = layers.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) - clip_var = layers.elementwise_div( - x=max_global_norm, - y=layers.elementwise_max( - x=global_norm_var, y=max_global_norm)) + + # only when global_norm_var > max_global_norm, grad need clip + need_clip = False + if global_norm_var > max_global_norm: + need_clip = True + + if need_clip: + clip_var = layers.elementwise_div( + x=max_global_norm, y=global_norm_var) for p, g in params_grads: if g is None: continue @@ -508,10 +510,14 @@ def _dygraph_clip(self, params_grads): params_and_grads.append((p, g)) continue # TODO(wangxi): use inplace elementwise_mul - clip_input = (clip_var.astype('float16') - if g.dtype == core.VarDesc.VarType.FP16 else clip_var) - new_grad = layers.elementwise_mul(x=g, y=clip_input) - params_and_grads.append((p, new_grad)) + if need_clip: + clip_input = (clip_var.astype('float16') + if g.dtype == core.VarDesc.VarType.FP16 else + clip_var) + new_grad = layers.elementwise_mul(x=g, y=clip_input) + params_and_grads.append((p, new_grad)) + else: + params_and_grads.append((p, g)) return params_and_grads diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index c466cafe1ff3c..80d2ccb0d5ca6 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -18,7 +18,9 @@ __all__ = ["CustomOpLists", "AutoMixedPrecisionLists"] # lookup_table fp16 is slower than fp32, though fp16 is supported. 
-_extra_unsupported_fp16_list = {'lookup_table', 'lookup_table_v2'} +_extra_unsupported_fp16_list = { + 'lookup_table', 'lookup_table_v2', 'scatter', 'scatter_grad' +} class AutoMixedPrecisionLists(object): diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 24caf1479543e..a3fdca5e40669 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -506,7 +506,7 @@ def save_quantized_model(self, model, path, input_spec=None, **config): main_program=infer_program.clone(), model_filename=model_filename, params_filename=params_filename, - clip_extra=True) + clip_extra=False) if is_dynamic_mode: paddle.disable_static() diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index e9173a86b89fa..9da798375af25 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -17,6 +17,7 @@ import logging import numpy as np import shutil +from inspect import isgeneratorfunction from .... import io from .... import core from .... import framework @@ -136,6 +137,7 @@ def __init__(self, params_filename=None, batch_generator=None, sample_generator=None, + data_loader=None, batch_size=10, batch_nums=None, algo="KL", @@ -175,6 +177,9 @@ def __init__(self, calibrate data for DataLoader, and it only returns a sample every time. Note that, sample_generator and batch_generator, only one should be set. Beisdes, sample_generator dose not support lod tensor. + data_loader(Python Generator, Paddle.io.DataLoader, optional): The + Generator or Dataloader provides calibrate data, and it could + return a batch every time. batch_size(int, optional): The batch size of DataLoader. Default is 10. batch_nums(int, optional): If batch_nums is not None, the number of calibrate data is batch_size*batch_nums. If batch_nums is None, use @@ -279,8 +284,11 @@ def __init__(self, assert executor is not None, "The executor cannot be None." assert model_dir is not None, "The model_dir cannot be None." assert any([gen is not None] for gen in [sample_generator, - batch_generator]), "The sample_generator and batch_generator " \ - "cannot be None in the same time." + batch_generator, data_loader]), "The sample_generator, batch_generator " \ + "and data_loader cannot be None in the same time." + if data_loader is not None: + assert isinstance(data_loader, (io.DataLoader, type(isgeneratorfunction))), \ + "data_loader only accepts `paddle.io.DataLoader` or Generator instance." assert batch_size > 0, "The batch_size should be greater than 0." assert algo in self._support_algo_type, \ "The algo should be KL, hist, mse, avg, abs_max or min_max." 
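The new `data_loader` argument of `PostTrainingQuantization` accepts either a `paddle.io.DataLoader` or a plain Python generator function that yields one feed dict per batch (which is why the isinstance check compares against the type of an ordinary function). A hedged sketch of wiring a generator through it; the model directory, file names, and the feed name "x" are placeholder assumptions mirroring the test further below, not values from a real model:

```python
# Illustrative sketch of the data_loader path added in this PR. Paths and the
# feed name "x" are assumptions; substitute the names of your own saved model.
import numpy as np
import paddle
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())

def calib_reader():
    # Yield 10 calibration batches of random data shaped like MNIST inputs.
    for _ in range(10):
        yield {"x": np.random.random((10, 1, 28, 28)).astype("float32")}

ptq = PostTrainingQuantization(
    executor=exe,
    model_dir="./mnist_model",            # assumed saved inference model
    model_filename="model.pdmodel",
    params_filename="model.pdiparams",
    data_loader=calib_reader,             # generator function, not sample_generator
    batch_size=10,
    batch_nums=10,
    algo="KL")
ptq.quantize()
ptq.save_quantized_model("./mnist_quant_model")
```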
@@ -323,7 +331,7 @@ def __init__(self, self._program = None self._feed_list = None self._fetch_list = None - self._data_loader = None + self._data_loader = data_loader self._out_scale_op_list = _out_scale_op_list self._quantized_weight_var_name = set() @@ -473,6 +481,9 @@ def _load_model_data(self): feed_vars = [framework._get_var(str(var_name), self._program) \ for var_name in self._feed_list] + + if self._data_loader is not None: + return self._data_loader = io.DataLoader.from_generator( feed_list=feed_vars, capacity=3 * self._batch_size, iterable=True) if self._sample_generator is not None: diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 3e5db06a86a37..7dbd927874d19 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -63,7 +63,8 @@ def __init__(self, self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip is not None else set( [-1]) self._scale_immutable_ops = [ - 'transpose2', 'reshape2', 'pool2d', 'slice' + 'transpose2', 'reshape2', 'pool2d', 'slice', 'nearest_interp', + 'nearest_interp_v2' ] self._scale_ops = ['scale'] self._conv_ops = ['conv2d', 'depthwise_conv2d'] diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py index 3c3dfd08fccfa..642bcf2a47679 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py @@ -115,19 +115,30 @@ def generate_quantized_model(self, is_use_cache_file=False, is_optimize_model=False, batch_size=10, - batch_nums=10): + batch_nums=10, + is_data_loader=False): place = fluid.CPUPlace() exe = fluid.Executor(place) scope = fluid.global_scope() val_reader = paddle.dataset.mnist.train() + def val_data_generator(): + batches = [] + for data in val_reader(): + batches.append(data[0].reshape(1, 28, 28)) + if len(batches) == batch_size: + batches = np.asarray(batches) + yield {"x": batches} + batches = [] + ptq = PostTrainingQuantization( executor=exe, model_dir=model_path, model_filename='model.pdmodel', params_filename='model.pdiparams', - sample_generator=val_reader, + sample_generator=val_reader if not is_data_loader else None, + data_loader=val_data_generator if is_data_loader else None, batch_size=batch_size, batch_nums=batch_nums, algo=algo, @@ -153,7 +164,8 @@ def run_test(self, diff_threshold, batch_size=10, infer_iterations=10, - quant_iterations=5): + quant_iterations=5, + is_data_loader=False): origin_model_path = self.download_model(data_url, data_md5, model_name) #origin_model_path = os.path.join(origin_model_path, model_name) @@ -166,8 +178,15 @@ def run_test(self, print("Start INT8 post training quantization for {0} on {1} images ...". 
format(model_name, quant_iterations * batch_size)) self.generate_quantized_model( - origin_model_path, algo, quantizable_op_type, is_full_quantize, - is_use_cache_file, is_optimize_model, batch_size, quant_iterations) + origin_model_path, + algo, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + batch_size, + quant_iterations, + is_data_loader=is_data_loader) print("Start INT8 inference for {0} on {1} images ...".format( model_name, infer_iterations * batch_size)) @@ -307,6 +326,20 @@ def test_post_training_abs_max(self): is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold, batch_size, infer_iterations, quant_iterations) + self.run_test( + model_name, + data_url, + data_md5, + algo, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_size, + infer_iterations, + quant_iterations, + is_data_loader=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py index 994f89ab3e9f3..f0dae081dd48f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py @@ -216,6 +216,141 @@ def test_quant_update_activation(self): graph = quant2_int8_mkldnn_pass._update_activations(graph) self.check_graph_after_pass(graph) + class TestQuant2Int8MkldnnPassNearestInterp(unittest.TestCase): + def op_name(self): + return "nearest_interp" + + def setUp(self): + self.scope = fluid.Scope() + self.place = fluid.CPUPlace() + self.dtype = np.float32 + self.use_cudnn = False + self.use_mkldnn = True + + # conv2d + self.data_format = "ANYLAYOUT" + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [1, 3, 5, 5] + self.filter_size = [16, 3, 3, 3] + self.conv_output_size = [1, 16, 3, 3] + self.input = np.random.random(self.input_size).astype(self.dtype) + self.filter = np.random.random(self.filter_size).astype(self.dtype) + self.conv_output = np.ndarray(self.conv_output_size).astype( + self.dtype) + + # nearest_interp + self.out_h = 1 + self.out_w = 1 + self.scale = 2.0 + self.interp_method = 'nearest' + self.data_layout = 'NCHW' + self.nearest_interp_output_size = [1, 1, 2, 2] + self.nearest_interp_output = np.ndarray( + self.nearest_interp_output_size).astype(self.dtype) + + # dropout + self.dropout_prob = 0.5 + self.dropout_out = np.ndarray( + self.nearest_interp_output_size).astype(self.dtype) + self.dropout_mask = np.ndarray(self.nearest_interp_output_size) + + self.quantized_ops = { + "conv2d", "nearest_interp", "nearest_interp_v2" + } + self.variables = { + "input": self.input, + "filter": self.filter, + "conv_output": self.conv_output, + "nearest_interp_output": self.nearest_interp_output, + "dropout_out": self.dropout_out, + 'dropout_mask': self.dropout_mask + } + + def prepare_program(self, program): + block = program.global_block() + for name in self.variables: + block.create_var( + name=name, + dtype="float32", + shape=self.variables[name].shape) + block.append_op( + type="conv2d", + inputs={ + "Input": block.var('input'), + 'Filter': block.var('filter') + }, + outputs={"Output": block.var('conv_output')}, + attrs={ + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 
'fuse_relu': True + }) + block.append_op( + type=self.op_name(), + inputs={"X": block.var('conv_output'), }, + outputs={"Out": block.var('nearest_interp_output')}, + attrs={ + 'interp_method': self.interp_method, + 'out_h': self.out_h, + 'out_w': self.out_w, + 'scale': self.scale, + 'data_layout': self.data_layout, + 'use_mkldnn': self.use_mkldnn + }) + block.append_op( + type='dropout', + inputs={"X": block.var('nearest_interp_output'), }, + outputs={ + 'Out': block.var('dropout_out'), + 'Mask': block.var('dropout_mask') + }, + attrs={'dropout_prob': self.dropout_prob, }) + + def check_graph_after_pass(self, graph): + for op in graph.all_op_nodes(): + if op.op().type() in self.quantized_ops: + self.assertTrue(op.op().has_attr("mkldnn_data_type")) + self.assertTrue(op.op().attr("mkldnn_data_type") == "int8") + + def test_quant_update_activation(self): + program = fluid.Program() + with fluid.program_guard(program): + self.prepare_program(program) + graph = IrGraph(core.Graph(program.desc), for_test=True) + quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass( + self.quantized_ops, + _scope=self.scope, + _place=self.place, + _core=core, + _debug=False) + + input_scale_tensor = quant2_int8_mkldnn_pass._convert_scale2tensor( + np.array(self.scale).astype(np.float64)) + output_scale_tensor = quant2_int8_mkldnn_pass._convert_scale2tensor( + np.array(1. / self.scale * self.scale).astype(np.float64)) + var_scale = { + "input": (False, input_scale_tensor), + "filter": (False, input_scale_tensor), + "conv_output": (False, output_scale_tensor), + } + if core.avx_supported(): + quant2_int8_mkldnn_pass._var_quant_scales = var_scale + graph = quant2_int8_mkldnn_pass._propagate_scales(graph) + graph = quant2_int8_mkldnn_pass._quantize_fp32_graph(graph) + self.check_graph_after_pass(graph) + + class TestQuant2Int8MkldnnPassNearestInterpV2(unittest.TestCase): + def op_name(self): + return "nearest_interp_v2" + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index 61e3a61fc9cd2..937fcdf0463be 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -16,12 +16,17 @@ Functions for Auto SParsity (ASP) training and inference. """ +import os import copy import numpy as np import paddle from paddle.fluid import global_scope, program_guard, layers from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.contrib import sparsity +from paddle.fluid import core + +OpRole = core.op_proto_and_checker_maker.OpRole +OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() __all__ = [ 'decorate', 'prune_model', 'set_excluded_layers', 'reset_excluded_layers' @@ -150,7 +155,8 @@ def prune_model(main_program=None, n=2, m=4, mask_algo='mask_1d', - with_mask=True): + with_mask=True, + sharding=False): r""" Pruning parameters of supported layers in :attr:`main_program` via specified mask generation function given by :attr:`mask_algo`. This @@ -173,6 +179,7 @@ def prune_model(main_program=None, mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`. The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True. + sharding (bool, optional): Whether to turn on sharding (model parallel) during training. Please consider turning it ON when encountering OOM using sharding. 
Default is False. Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. Examples: @@ -214,8 +221,12 @@ def prune_model(main_program=None, # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` sparsity.prune_model(main_program, mask_algo='mask_2d_best') """ - device = paddle.device.get_device() - place = paddle.set_device(device) + if sharding: + gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) + place = paddle.CUDAPlace(gpu_id) + else: + device = paddle.device.get_device() + place = paddle.set_device(device) MaskAlgo_mapping = { 'mask_1d': sparsity.MaskAlgo.MASK_1D, @@ -528,8 +539,11 @@ def _insert_sparse_mask_ops(cls, main_program, param_grads): 'Y': asp_info.mask_vars[param_grad[0].name] }, outputs={'Out': param_grad[0]}, - attrs={'axis': -1, - 'use_mkldnn': False}) + attrs={ + 'axis': -1, + 'use_mkldnn': False, + OP_ROLE_KEY: OpRole.Optimize + }) class OptimizerWithSparsityGuarantee(object): diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index a9cfe0babec0f..9b99e17e9e51c 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -268,6 +268,7 @@ def to_list(s): from .core_avx import _is_dygraph_debug_enabled from .core_avx import _dygraph_debug_level from .core_avx import _switch_tracer + from .core_avx import _set_eager_tracer from .core_avx import _disable_eager_mode from .core_avx import _enable_eager_mode from .core_avx import _in_eager_mode @@ -324,6 +325,7 @@ def to_list(s): from .core_noavx import _is_dygraph_debug_enabled from .core_noavx import _dygraph_debug_level from .core_noavx import _switch_tracer + from .core_noavx import _set_eager_tracer from .core_noavx import _disable_eager_mode from .core_noavx import _enable_eager_mode from .core_noavx import _in_eager_mode diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 78c329d96a82b..bd7a870afe3b6 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -22,7 +22,7 @@ import multiprocessing import warnings -from .framework import Variable, default_main_program, _current_expected_place, in_dygraph_mode +from .framework import Variable, default_main_program, _current_expected_place, in_dygraph_mode, _in_eager_mode from .framework import _cpu_num, _cuda_ids __all__ = ['DataFeeder'] @@ -102,12 +102,20 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''): if not isinstance(expected_type, tuple): expected_type = (expected_type, ) expected_type += (core.VarBase, ) + # TODO(jiabin): uncomment it when we support declarative mode in eager + # if _in_eager_mode(): + # expected_type += (core.eager.EagerTensor, ) elif isinstance(input, core.VarBase): raise TypeError( "Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. " "Because received '{}' in {} is a imperative Variable.".format( input_name, op_name)) - + elif hasattr(core, "eager"): + if isinstance(input, core.eager.EagerTensor): + raise TypeError( + "Please use `with fluid.dygraph.guard()` as context or `fluid.enable_dygraph()` to switch to imperative mode firstly. " + "Because received '{}' in {} is a imperative Variable.".format( + input_name, op_name)) if not isinstance(input, expected_type): raise TypeError( "The type of '%s' in %s must be %s, but received %s. 
%s" % diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 642ac5e26a25e..15adf4cb6faaf 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -71,7 +71,9 @@ } PURE_FP16_WHITE_LIST = {' '} -PURE_FP16_BLACK_LIST = {'lookup_table', 'lookup_table_v2'} +PURE_FP16_BLACK_LIST = { + 'lookup_table', 'lookup_table_v2', 'scatter', 'scatter_grad' +} #NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list @@ -128,12 +130,12 @@ def pure_fp16_initialize(models): for idx in range(len(models)): for layer in models[idx].sublayers(include_self=True): layer._casted_by_pure_fp16 = True - if len(layer._sub_layers) is 0: - - if (layer._dtype is 'float16') or isinstance(layer, ( - paddle.nn.BatchNorm, paddle.nn.LayerNorm)): - continue - layer.to(dtype='float16') + if (layer._dtype is 'float16') or isinstance( + layer, (paddle.nn.BatchNorm, paddle.nn.BatchNorm1D, + paddle.nn.BatchNorm2D, paddle.nn.BatchNorm3D, + paddle.nn.LayerNorm)): + continue + layer._to_impl(dtype='float16', include_sublayers=False) return models diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index f54a1629196a0..9234577b8cc23 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -25,7 +25,7 @@ import logging from ..data_feeder import convert_dtype import warnings -from ..framework import _get_paddle_place +from ..framework import _get_paddle_place, _in_eager_mode import paddle __all__ = [ @@ -720,10 +720,16 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): if value.dtype != dtype: value = value.astype(dtype) - py_var = core.VarBase( - value=value, - place=framework._current_expected_place(), - persistable=False, - zero_copy=zero_copy, - name=name if name else '') - return py_var + if _in_eager_mode(): + return core.eager.EagerTensor(value, + framework._current_expected_place(), + False, zero_copy, name + if name else None, True) + else: + py_var = core.VarBase( + value=value, + place=framework._current_expected_place(), + persistable=False, + zero_copy=zero_copy, + name=name if name else '') + return py_var diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py index c25574c39dafe..30012fb8666fc 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -394,3 +394,34 @@ def _set_spec_stop_gradient(spec, stop_gradient): """ assert isinstance(spec, paddle.static.InputSpec) spec.stop_gradient = stop_gradient + + +def _hash_spec_names(args_specs, kwargs_specs): + """ + Generater hash spec with args/kwargs InputSpec names. + Consider the following InputSpecs with same shape/dtype except for name: + 1. [InputSpec([3,3], 'float32', 'x'), InputSpec([3,3], 'float32', 'x')] + 2. [InputSpec([3,3], 'float32', 'x'), InputSpec([3,3], 'float32', 'y')] + Under @to_static, we should generate two different program not just one, because + the former has one input ('x'), but the latter has two input ('x', 'y'). 
+ """ + spec_names = [ + spec.name for spec in flatten(args_specs) + if isinstance(spec, paddle.static.InputSpec) + ] + spec_names += [ + spec.name for spec in flatten(kwargs_specs) + if isinstance(spec, paddle.static.InputSpec) + ] + i, name_ids = 0, {} + + def to_idx(name): + nonlocal i + if name not in name_ids: + name_ids[name] = i + i += 1 + return name_ids[name] + + value = [to_idx(name) for name in spec_names] + + return tuple(value) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 19479a190c3b9..f8800f3037b40 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -43,7 +43,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import type_name from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap from paddle.fluid.dygraph.dygraph_to_static.utils import make_hashable -from paddle.fluid.dygraph.dygraph_to_static.function_spec import FunctionSpec +from paddle.fluid.dygraph.dygraph_to_static.function_spec import FunctionSpec, _hash_spec_names from paddle.fluid.dygraph.dygraph_to_static.function_spec import get_buffers, get_parameters from paddle.fluid.wrapped_decorator import signature_safe_contextmanager @@ -147,7 +147,7 @@ class CacheKey(object): """ __slots__ = [ 'function_spec', 'input_args_with_spec', 'input_kwargs_with_spec', - 'class_instance', 'kwargs' + 'class_instance', 'kwargs', '_spec_names_id' ] def __init__(self, function_spec, input_args_with_spec, @@ -168,6 +168,8 @@ def __init__(self, function_spec, input_args_with_spec, self.class_instance = class_instance # NOTE: `kwargs` is usually not considered as basic member for `__hash__` self.kwargs = kwargs + self._spec_names_id = _hash_spec_names(input_args_with_spec, + input_kwargs_with_spec) @classmethod def from_func_and_args(cls, function_spec, args, kwargs, class_instance): @@ -197,7 +199,7 @@ def __hash__(self): return hash((id(self.function_spec), make_hashable(self.input_args_with_spec, error_msg), make_hashable(self.input_kwargs_with_spec, error_msg), - self.class_instance)) + self._spec_names_id, self.class_instance)) def __eq__(self, other): return (type(self) is type(other)) and hash(self) == hash(other) @@ -703,6 +705,7 @@ class ProgramCache(object): """ def __init__(self): + # {hash_id : (concrete_program, partial_layer)} self._caches = collections.OrderedDict() def _build_once(self, cache_key): @@ -718,9 +721,9 @@ def __getitem__(self, item): if not isinstance(item, CacheKey): raise ValueError('type(item) should be CacheKey, but received %s' % type_name(item)) - - if item not in self._caches: - self._caches[item] = self._build_once(item) + item_id = hash(item) + if item_id not in self._caches: + self._caches[item_id] = self._build_once(item) # Note: raise warnings if number of traced program is more than `max_tracing_count` current_tracing_count = len(self._caches) if current_tracing_count > MAX_TRACED_PROGRAM_COUNT: @@ -729,18 +732,19 @@ def __getitem__(self, item): "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.". 
format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT)) - return self._caches[item] + return self._caches[item_id] def get_program(self, item): if not isinstance(item, CacheKey): raise ValueError( "Input item's type should be FunctionSpec, but received %s" % type_name(item)) - if item not in self._caches: + item_id = hash(item) + if item_id not in self._caches: raise RuntimeError( "Failed to find program for input item, please decorate input function by `@paddle.jit.to_static`." ) - return self._caches[item] + return self._caches[item_id] def last(self): assert len( diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 3fd690cab058a..9ffdea969be5d 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -1077,8 +1077,13 @@ def append_var_from_block_desc_static(block, else: lod_level = None + if var_desc.persistable(): + current_block = block.program.global_block() + else: + current_block = block + vars_append.append( - block.create_var( + current_block.create_var( name=var_desc.name(), dtype=data_type, type=var_type, diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py index 5bf5eda19a5d0..4ad575d325bc9 100644 --- a/python/paddle/fluid/dygraph/layer_object_helper.py +++ b/python/paddle/fluid/dygraph/layer_object_helper.py @@ -21,6 +21,7 @@ from .. import core from six.moves import zip from ..layer_helper_base import LayerHelperBase +from ..dygraph_utils import _append_activation_in_dygraph class LayerObjectHelper(LayerHelperBase): @@ -162,14 +163,18 @@ def append_activation(self, input_var, act=None, use_cudnn=None): if (use_mkldnn is not None) and use_mkldnn: act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') - - tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) - self.append_op( - type=act_type, - inputs={"X": [input_var]}, - outputs={"Out": [tmp]}, - attrs=act) - return tmp + if in_dygraph_mode(): + res = _append_activation_in_dygraph(input_var, act_type, use_cudnn, + use_mkldnn) + return res + else: + tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) + self.append_op( + type=act_type, + inputs={"X": [input_var]}, + outputs={"Out": [tmp]}, + attrs=act) + return tmp def is_instance(self, param, cls): """Check if the input parameter is instance of input class diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index df2a00978aa67..4a60bdc4c72d3 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -31,7 +31,7 @@ from paddle.fluid import core from .layer_object_helper import LayerObjectHelper from .layer_hooks import record_program_ops_pre_hook, set_op_customized_attrs_post_hook, LayerOpsRecoder -from .base import program_desc_tracing_guard, param_guard, in_declarative_mode +from .base import program_desc_tracing_guard, param_guard, in_declarative_mode, _convert_into_variable from paddle.fluid import framework from ..param_attr import ParamAttr from paddle.fluid.executor import Executor, global_scope @@ -914,16 +914,7 @@ def _dygraph_call_func(self, *inputs, **kwargs): return outputs def __call__(self, *inputs, **kwargs): - # NOTE(Aurelius84): Why we still need param_guard here? - # In case of ControlFlow, true_fn and false_fn will contain - # parameters that may not trigger logic of `Operator` to create - # them. we add this to make sure all parameters is available. 
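The hunks that follow drop the `param_guard` wrapping of `__call__` and instead convert parameters and buffers lazily in `__getattr__` when running under declarative mode. A standalone sketch of that lazy-conversion pattern (plain Python; the class and the 'converted' tuple are hypothetical stand-ins for `_convert_into_variable`):

.. code-block:: python

    class SketchLayer(object):
        def __init__(self, declarative):
            self.__dict__['_parameters'] = {'weight': 1.0}
            self.__dict__['_declarative'] = declarative

        def __getattr__(self, name):
            # Only reached when normal attribute lookup fails, i.e. for
            # names that live in the parameter dict.
            params = self.__dict__['_parameters']
            if name in params:
                value = params[name]
                if self.__dict__['_declarative']:
                    return ('converted', value)   # stand-in for _convert_into_variable(value)
                return value
            raise AttributeError(name)

    print(SketchLayer(declarative=True).weight)    # ('converted', 1.0)
    print(SketchLayer(declarative=False).weight)   # 1.0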
- - if in_declarative_mode() and not framework.in_dygraph_mode(): - with param_guard(self._parameters), param_guard(self._buffers): - return self._dygraph_call_func(*inputs, **kwargs) - else: - return self._dygraph_call_func(*inputs, **kwargs) + return self._dygraph_call_func(*inputs, **kwargs) def forward(self, *inputs, **kwargs): """ @@ -1103,6 +1094,8 @@ def __getattr__(self, name): if '_parameters' in self.__dict__: _parameters = self.__dict__['_parameters'] if name in self._parameters: + if in_declarative_mode(): + return _convert_into_variable(self._parameters[name]) return self._parameters[name] if '_sub_layers' in self.__dict__: _sub_layers = self.__dict__['_sub_layers'] @@ -1111,6 +1104,8 @@ def __getattr__(self, name): if '_buffers' in self.__dict__: _buffers = self.__dict__['_buffers'] if name in _buffers: + if in_declarative_mode(): + return _convert_into_variable(_buffers[name]) return _buffers[name] return object.__getattribute__(self, name) @@ -1181,11 +1176,16 @@ def _remove_if_exist(*dicts): # but should all non-Variable _buffers[name] be re-assign? We # should consider it in the future. I current wrote this as # conservative code. - if _buffers[name] is None or type(_buffers[ - name]) == core.VarBase: + if in_declarative_mode() and _buffers[name] is None: + raise RuntimeError( + 'In Dy2stat, self.{0} is a buffer and self.{0} is ' + 'not allowed to be set to Variable when self.{0} is None.'. + format(name)) + elif _buffers[name] is None or type( + getattr(self, name)) == core.VarBase: _buffers[name] = assign(value) else: - assign(value, _buffers[name]) + assign(value, getattr(self, name)) elif value is not None: raise TypeError( "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'" @@ -1276,11 +1276,11 @@ def register_state_dict_hook(self, hook): self._state_dict_hooks[hook_remove_helper._hook_id] = hook return hook_remove_helper - def _state_dict_impl(self, - destination=None, - include_sublayers=True, - structured_name_prefix="", - include_non_persistable_buffer=False): + def _obtain_parameters_buffers(self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + include_non_persistable_buffer=False): """ Get all parameters and persistable buffers of current layer and its sub-layers. 
And set them into a dict @@ -1313,7 +1313,16 @@ def _state_dict_impl(self, structured_name_prefix + layer_name + ".", include_non_persistable_buffer)) destination = destination_temp + return destination + def _state_dict_impl(self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + include_non_persistable_buffer=False): + destination = self._obtain_parameters_buffers( + destination, include_sublayers, structured_name_prefix, + include_non_persistable_buffer) for state_dict_hook in self._state_dict_hooks.values(): hook_result = state_dict_hook(destination) if hook_result is not None: @@ -1470,23 +1479,6 @@ def _set_var(var, ndarray): for param, state in matched_param_state: _set_var(param, state) - def _apply(self, func, device, dtype, blocking): - for layer in self.children(): - layer._apply(func, device, dtype, blocking) - - for key, param in self._parameters.items(): - if param is not None: - with no_grad(): - param_applied = func(param, device, dtype, blocking) - - if param.grad is not None: - with no_grad(): - grad_applied = func(param._grad_ivar(), device, dtype, - blocking) - - for key, buf in self._buffers.items(): - self._buffers[key] = func(buf, device, dtype, blocking) - def to(self, device=None, dtype=None, blocking=None): ''' Cast the parameters and buffers of Layer by the give device, dtype and blocking. @@ -1500,7 +1492,7 @@ def to(self, device=None, dtype=None, blocking=None): blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. - + Returns: self @@ -1534,6 +1526,53 @@ def to(self, device=None, dtype=None, blocking=None): # [[-0.04989364, -0.56889004], # [ 0.33960250, 0.96878713]]) + ''' + return self._to_impl( + device=device, + dtype=dtype, + blocking=blocking, + include_sublayers=True) + + def _apply(self, func, device, dtype, blocking, include_sublayers=True): + if include_sublayers: + for layer in self.children(): + layer._apply(func, device, dtype, blocking, include_sublayers) + + for key, param in self._parameters.items(): + if param is not None: + with no_grad(): + param_applied = func(param, device, dtype, blocking) + + if param.grad is not None: + with no_grad(): + grad_applied = func(param._grad_ivar(), device, dtype, + blocking) + + for key, buf in self._buffers.items(): + self._buffers[key] = func(buf, device, dtype, blocking) + + def _to_impl(self, + device=None, + dtype=None, + blocking=None, + include_sublayers=True): + ''' + Cast the parameters and buffers of Layer by the give device, dtype and blocking. + + Parameters: + device(str|paddle.CPUPlace()|paddle.CUDAPlace()|paddle.CUDAPinnedPlace()|paddle.XPUPlace()|None, optional): The device of the Layer which want to be stored. + If None, the device is the same with the original Tensor. If device is string, it can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the + index of the GPUs or XPUs. Default: None. + + dtype(str|numpy.dtype|paddle.dtype|None, optional): The type of the data. If None, the dtype is the same with the original Tensor. Default: None. + + blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be + asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. 
+ + include_sublayers(bool|True, optional): If True, deal with self and all sublayers parameters and buffers, if not only deal with self parameters and buffers. Default: True. + + Returns: + self ''' @@ -1610,7 +1649,7 @@ def transform(t, device, dtype, blocking): with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=UserWarning) - self._apply(transform, device, dtype, blocking) + self._apply(transform, device, dtype, blocking, include_sublayers) self._dtype = dtype return self diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 3731976ad18ab..64c418fabb11f 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -18,6 +18,7 @@ from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator from ..layers.layer_function_generator import OpProtoHolder from . import no_grad +from ..framework import _in_eager_mode import numpy as np import warnings @@ -59,6 +60,7 @@ ] _already_patch_varbase = False +_already_patch_eager_tensor = False def monkey_patch_math_varbase(): @@ -219,7 +221,11 @@ def __impl__(self, other_var): # 2. create varbase for scalar lhs_dtype = self.dtype - if not isinstance(other_var, core.VarBase): + if _in_eager_mode(): + other_var_should_be = core.eager.EagerTensor + else: + other_var_should_be = core.VarBase + if not isinstance(other_var, other_var_should_be): if isinstance(other_var, complex): import paddle other_var = paddle.to_tensor(other_var, dtype='complex64') @@ -332,21 +338,30 @@ def __impl__(self, other_var): ] global _already_patch_varbase - if not _already_patch_varbase: + global _already_patch_eager_tensor + + if core._in_eager_mode(): + local_already_patch = _already_patch_eager_tensor + _already_patch_eager_tensor = True + local_tensor = core.eager.EagerTensor + else: + local_already_patch = _already_patch_varbase + _already_patch_varbase = True + local_tensor = core.VarBase + + if not local_already_patch: for method in varbase_methods: method_name = method[0] method_impl = method[1] - setattr(core.VarBase, method_name, method_impl) + setattr(local_tensor, method_name, method_impl) else: import paddle.tensor # Tensor method from module paddle.tensor for method_name in paddle.tensor.tensor_method_func: - if hasattr(core.VarBase, method_name): continue + if hasattr(local_tensor, method_name): continue method_impl = getattr(paddle.tensor, method_name, None) - if method_impl: setattr(core.VarBase, method_name, method_impl) + if method_impl: setattr(local_tensor, method_name, method_impl) for magic_method, origin_method in paddle.tensor.magic_method_func: impl = getattr(paddle.tensor, origin_method, None) - if impl: setattr(core.VarBase, magic_method, impl) - - _already_patch_varbase = True + if impl: setattr(local_tensor, magic_method, impl) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 81bb812202c9e..ddb86848f842a 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -356,7 +356,7 @@ def sync_params_buffers(model, src_rank=0, is_model_parallel=False): model_vars = [] - for _, param in model.state_dict().items(): + for _, param in model._obtain_parameters_buffers().items(): if not isinstance(param, core.VarBase): raise TypeError("The data type of '%s' must be Varbase" % param.name) diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 2047968085b3a..2ecb0998dd355 100644 
--- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -39,10 +39,16 @@ def __init__(self): self._train_mode = True - def trace_op(self, type, inputs, outputs, attrs, stop_gradient=False): + def trace_op(self, + type, + inputs, + outputs, + attrs, + stop_gradient=False, + inplace_map=None): self.trace(type, inputs, outputs, attrs, framework._current_expected_place(), self._has_grad and - not stop_gradient) + not stop_gradient, inplace_map if inplace_map else {}) def train_mode(self): self._train_mode = True diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index f308af04e5e58..c61f87ccf9089 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -22,7 +22,7 @@ from .. import framework from .. import core from .. import unique_name -from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_ +from ..framework import Variable, Parameter, ParamBase, _getitem_impl_, _setitem_impl_, _in_eager_mode from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss @@ -58,6 +58,9 @@ def remove(self): return False +_already_patch_repr = False + + def monkey_patch_varbase(): @switch_to_static_graph def _to_static_var(self, to_parameter=False, **kwargs): @@ -146,7 +149,11 @@ def set_value(self, value): out = linear(t) # call with different weight """ - assert isinstance(value, (np.ndarray, core.VarBase, dict, str)), \ + if _in_eager_mode(): + base_tensor = core.eager.EagerTensor + else: + base_tensor = core.VarBase + assert isinstance(value, (np.ndarray, base_tensor, dict, str)), \ "Variable set_value function, arguments type only support Variable, numpy, VarBase, dict, string." if isinstance(value, (dict, str)): @@ -160,7 +167,7 @@ def set_value(self, value): self.value().set_string_list(value) else: value_np = value - if isinstance(value, core.VarBase): + if isinstance(value, base_tensor): value_np = value.numpy() self_tensor_np = self.numpy() @@ -231,22 +238,40 @@ def backward(self, grad_tensor=None, retain_graph=False): """ if framework.in_dygraph_mode(): if grad_tensor is not None: - assert isinstance( - grad_tensor, paddle. - Tensor), "The type of grad_tensot must be paddle.Tensor" + if _in_eager_mode(): + assert isinstance( + grad_tensor, core.eager.EagerTensor + ), "The type of grad_tensor must be paddle.Tensor" + else: + assert isinstance( + grad_tensor, paddle. + Tensor), "The type of grad_tensor must be paddle.Tensor" assert grad_tensor.shape == self.shape, \ "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format( grad_tensor.name, grad_tensor.shape, self.name, self.shape) + if _in_eager_mode(): + if grad_tensor is None: + grad_tensor = [] + else: + grad_tensor = [grad_tensor] if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. 
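The eager-mode branches added in this hunk only change which C++ entry point runs the backward pass; the Python-level contract of `backward` is unchanged. A minimal usage sketch, assuming a recent dynamic-graph Paddle build:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
    y = x * 2.0
    # An explicit grad_tensor must match y's shape; the legacy and the eager
    # path both receive it, just wrapped differently.
    y.backward(grad_tensor=paddle.to_tensor([1.0, 1.0, 1.0]))
    print(x.grad)   # expected gradient of y = 2*x: [2., 2., 2.]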
scaled_loss = scale_loss(self) - core.dygraph_run_backward([scaled_loss], [grad_tensor], - retain_graph, - framework._dygraph_tracer()) + if _in_eager_mode(): + core.eager.run_backward([scaled_loss], grad_tensor, + retain_graph) + else: + core.dygraph_run_backward([scaled_loss], [grad_tensor], + retain_graph, + framework._dygraph_tracer()) else: - core.dygraph_run_backward([self], [grad_tensor], retain_graph, - framework._dygraph_tracer()) + if _in_eager_mode(): + core.eager.run_backward([self], grad_tensor, retain_graph) + else: + core.dygraph_run_backward([self], [grad_tensor], + retain_graph, + framework._dygraph_tracer()) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") @@ -280,15 +305,22 @@ def gradient(self): # [500.] """ - if self._grad_ivar() is None: - return None + if _in_eager_mode(): + if not self.grad._is_initialized(): + return None + # TODO(wanghuancoder) support SELECTED_ROWS + return self.grad.numpy() + else: + if self._grad_ivar() is None: + return None - new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True) - if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS: - return (np.array(new_ivar.value().get_selected_rows().get_tensor()), + new_ivar = self._grad_ivar()._copy_to(core.CPUPlace(), True) + if self._grad_ivar().type == core.VarDesc.VarType.SELECTED_ROWS: + return ( + np.array(new_ivar.value().get_selected_rows().get_tensor()), np.array(new_ivar.value().get_selected_rows().rows())) - else: - return np.array(new_ivar.value().get_tensor()) + else: + return np.array(new_ivar.value().get_tensor()) @framework.dygraph_only def register_hook(self, hook): @@ -555,8 +587,12 @@ def __str__(self): # [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436], # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) """ - from paddle.tensor.to_string import to_string - return to_string(self) + if _in_eager_mode(): + from paddle.tensor.to_string import eager_tensor_to_string + return eager_tensor_to_string(self) + else: + from paddle.tensor.to_string import to_string + return to_string(self) def __deepcopy__(self, memo): """ @@ -583,7 +619,10 @@ def __deepcopy__(self, memo): raise RuntimeError( "Only Leaf Tensor support the deepcopy at the moment, non-Leaf Tensors contains graph information that does't support deepcopy" ) - new_varbase = core.VarBase() + if _in_eager_mode(): + new_varbase = core.eager.EagerTensor() + else: + new_varbase = core.VarBase() new_varbase.name = self.name + unique_name.generate("_deepcopy") memo[id(self)] = new_varbase new_varbase.copy_(self, True) @@ -717,33 +756,62 @@ def is_combine_index(item): # Call c++ func __setitem_varbase__ to speedup. 
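The patching table further down attaches a single set of Python methods to whichever tensor class is active, instead of hard-coding `core.VarBase`. Reduced to a standalone sketch (plain Python, hypothetical classes and methods):

.. code-block:: python

    class LegacyTensor(object):
        pass

    class EagerTensor(object):
        pass

    def monkey_patch(eager_mode):
        # Pick the target class once, then install the shared method table.
        target = EagerTensor if eager_mode else LegacyTensor
        def _to_static_var(self):
            return 'static-var'
        def _str(self):
            return 'Tensor(sketch)'
        for name, impl in (('_to_static_var', _to_static_var), ('__str__', _str)):
            setattr(target, name, impl)
        return target

    t = monkey_patch(eager_mode=True)()
    print(t._to_static_var(), str(t))   # static-var Tensor(sketch)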
return self.__setitem_varbase__(item, value) + @framework.dygraph_only + def _grad_ivar(self): + if self.grad._is_initialized(): + return self.grad + else: + return None + + @framework.dygraph_only + def clear_gradient(self, set_to_zero=True): + if set_to_zero: + self._zero_grads() + else: + self._clear_gradient() + + if core._in_eager_mode() and not hasattr(core, "eager"): + return + for method_name, method in ( ("__bool__", __bool__), ("__nonzero__", __nonzero__), ("_to_static_var", _to_static_var), ("set_value", set_value), ("block", block), ("backward", backward), ("clear_grad", clear_grad), - ("inplace_version", inplace_version), ("grad", grad), - ("gradient", gradient), ("register_hook", register_hook), - ("__str__", __str__), ("__repr__", __str__), - ("__deepcopy__", __deepcopy__), ("__module__", "paddle"), - ("__name__", "Tensor"), ("__array__", __array__), + ("inplace_version", inplace_version), ("gradient", gradient), + ("register_hook", register_hook), ("__str__", __str__), + ("__repr__", __str__), ("__deepcopy__", __deepcopy__), + ("__module__", "paddle"), ("__array__", __array__), ("__getitem__", __getitem__), ("item", item), ("__setitem__", __setitem__), ("_to", _to)): - setattr(core.VarBase, method_name, method) - - # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class. - # So, we need to overwrite it to a more readable one. - # See details in https://github.com/pybind/pybind11/issues/2537. - origin = getattr(core.VarDesc.VarType, "__repr__") - - def dtype_str(dtype): - if dtype in _PADDLE_DTYPE_2_NUMPY_DTYPE: - prefix = 'paddle.' - return prefix + _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] + if core._in_eager_mode(): + setattr(core.eager.EagerTensor, method_name, method) else: - # for example, paddle.fluid.core.VarDesc.VarType.LOD_TENSOR - return origin(dtype) + setattr(core.VarBase, method_name, method) + + if core._in_eager_mode(): + setattr(core.eager.EagerTensor, "_grad_ivar", _grad_ivar) + setattr(core.eager.EagerTensor, "clear_gradient", clear_gradient) + else: + setattr(core.VarBase, "__name__", "Tensor") + setattr(core.VarBase, "grad", grad) + + global _already_patch_repr + if not _already_patch_repr: + # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class. + # So, we need to overwrite it to a more readable one. + # See details in https://github.com/pybind/pybind11/issues/2537. + origin = getattr(core.VarDesc.VarType, "__repr__") + + def dtype_str(dtype): + if dtype in _PADDLE_DTYPE_2_NUMPY_DTYPE: + prefix = 'paddle.' + return prefix + _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] + else: + # for example, paddle.fluid.core.VarDesc.VarType.LOD_TENSOR + return origin(dtype) - setattr(core.VarDesc.VarType, "__repr__", dtype_str) + setattr(core.VarDesc.VarType, "__repr__", dtype_str) + _already_patch_repr = True # patch math methods for varbase monkey_patch_math_varbase() diff --git a/python/paddle/fluid/eager/__init__.py b/python/paddle/fluid/eager/__init__.py deleted file mode 100644 index 1dc82ef69979c..0000000000000 --- a/python/paddle/fluid/eager/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# incubate directory is mainly for internal use -# after we have tested incubate APIs in industrial application for a period -# we will move stable functions into fluid - -from . import eager_tensor_patch_methods - -__all__ = [] diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 9d167cf5336a3..ff589774c5170 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1385,23 +1385,24 @@ def _can_use_interpreter_core(program, place): key = _get_strong_program_cache_key(inner_program, feed, fetch_list) - program = self._add_feed_fetch_ops( - program=inner_program, - feed=feed, - fetch_list=fetch_list, - feed_var_name=feed_var_name, - fetch_var_name=fetch_var_name, - use_fetch_v2=True) - # a little bit tricy here, use inner_program before _add_feed_fetch_ops to get key # while use program to geet _StandaloneExecutor if key not in self._executor_cache._cached_executors: + program = self._add_feed_fetch_ops( + program=inner_program, + feed=feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + use_fetch_v2=True) + new_program = program.clone() new_exe = _StandaloneExecutor(self.place, new_program, scope) - self._executor_cache._cached_executors[key] = new_exe + self._executor_cache._cached_executors[key] = (new_program, + new_exe) - new_exe = self._executor_cache._cached_executors[key] + program, new_exe = self._executor_cache._cached_executors[key] self._feed_data(program, feed, feed_var_name, scope) if hasattr(program, 'lr_sheduler'): @@ -1415,7 +1416,10 @@ def _can_use_interpreter_core(program, place): [lr_value]).astype(convert_dtype(lr_var.dtype)) tensor = core.get_variable_tensor(scope, lr_sheduler._var_name) - tensor.set(data, self.place) + # NOTE(dev): `set` always call TensorCopySync that is a + # blocking behavior. So we use `_copy_from` to replace it. 
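The executor change above now builds the feed/fetch program only on a cache miss and stores the (program, executor) pair under a single key, so later runs reuse both. A plain-Python sketch of that build-on-miss shape (all names hypothetical):

.. code-block:: python

    _cached_executors = {}

    def get_program_and_exe(key, build_program, build_exe):
        # Rebuild the (expensive) feed/fetch program only for unseen keys and
        # cache it together with the executor compiled from it.
        if key not in _cached_executors:
            program = build_program()
            _cached_executors[key] = (program, build_exe(program))
        return _cached_executors[key]

    prog, exe = get_program_and_exe(
        'main_program#feed:x#fetch:out',
        build_program=lambda: ['feed x', 'matmul', 'fetch out'],
        build_exe=lambda p: 'StandaloneExecutor(%d ops)' % len(p))
    print(prog, exe)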
+ cpu_tensor = _as_lodtensor(data, core.CPUPlace()) + tensor._copy_from(cpu_tensor, self.place) return new_exe.run(list(feed.keys()), fetch_list, return_numpy) @@ -1967,7 +1971,11 @@ def _get_real_program_fetch_list(): return ctx - def _prepare_fleet_executor(self, program=None, scope=None, fleet_opt=None): + def _prepare_fleet_executor(self, + carrier_id="", + program=None, + scope=None, + fleet_opt=None): from ..distributed.fleet.proto import fleet_executor_desc_pb2 assert program, "Program for fleet executor should not be None" assert fleet_opt, "Configurations for fleet executor should not be None" @@ -2025,7 +2033,8 @@ def _prepare_fleet_executor(self, program=None, scope=None, fleet_opt=None): fleet_exe = core.FleetExecutor(fleet_exe_desc.SerializeToString()) place = core.Place() place.set_place(self.place) - fleet_exe.init(program.desc, scope, place, tasks, task_id_to_rank) + fleet_exe.init(carrier_id, program.desc, scope, place, tasks, + task_id_to_rank) return fleet_exe def _run_using_fleet_executor(self, @@ -2034,6 +2043,7 @@ def _run_using_fleet_executor(self, feed_var_name="feed", fetch_var_name="fetch", fetch_list=None): + # TODO(liyurui): Change cache strategy for multi carriers cache_key = _get_strong_program_cache_key(program, feed, fetch_list) cached_ctx = self._get_ctx_cache(cache_key) cached_scope = self._get_scope_cache(cache_key) @@ -2099,7 +2109,10 @@ def _run_using_fleet_executor(self, fetch_task.set_program(fetch_program) cached_ctx = self._prepare_fleet_executor( - program=cached_program, scope=cached_scope, fleet_opt=fleet_opt) + cache_key, + program=cached_program, + scope=cached_scope, + fleet_opt=fleet_opt) self._add_ctx_cache(cache_key, cached_ctx) if feed: # NOTE: don't have to traverse programs in task nodes, @@ -2118,7 +2131,7 @@ def _run_using_fleet_executor(self, lr_sheduler._var_name) tensor.set(data, self.place) - cached_ctx.run() + cached_ctx.run(cache_key) if fetch_list: arr = cached_scope.find_var(fetch_var_name).get_fetch_list() tensors = arr._move_to_list() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 076f39befbb9c..b306edb8fb2ae 100755 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -53,6 +53,7 @@ 'cpu_places', 'xpu_places', 'ipu_places', + 'mlu_places', 'cuda_pinned_places', 'in_dygraph_mode', 'is_compiled_with_cinn', @@ -79,20 +80,33 @@ _current_device = None global_prog_seed = 0 _current_pipeline_stage = None +_already_patch_eager_tensor = False _global_flags_ = core.globals() core._disable_eager_mode() @signature_safe_contextmanager -def _test_eager_guard(): +def _test_eager_guard(tracer=None): core._enable_eager_mode() _C_ops.switch_to_eager_ops() + global _already_patch_eager_tensor + if not _already_patch_eager_tensor: + from .dygraph.varbase_patch_methods import monkey_patch_varbase + monkey_patch_varbase() + from .dygraph import monkey_patch_math_varbase + monkey_patch_math_varbase() + _already_patch_eager_tensor = True + if tracer is None: + core._set_eager_tracer(_dygraph_tracer_) + else: + core._set_eager_tracer(tracer) try: yield finally: core._disable_eager_mode() _C_ops.switch_to_core_ops() + global_ipu_index = None global_ipu_stage = None ipu_index_attr_name = 'ipu_index' @@ -373,6 +387,18 @@ def _current_expected_place(): "You are using XPU version Paddle, but your XPU device is not set properly. CPU device will be used by default." 
) _global_expected_place_ = core.CPUPlace() + elif core.is_compiled_with_mlu(): + try: + device_count = core.get_mlu_device_count() + except Exception as e: + device_count = 0 + if device_count > 0: + _global_expected_place_ = core.MLUPlace(0) + else: + warnings.warn( + "You are using MLU version Paddle, but your MLU device is not set properly. CPU device will be used by default." + ) + _global_expected_place_ = core.CPUPlace() else: _global_expected_place_ = core.CPUPlace() @@ -452,6 +478,15 @@ def _npu_ids(): return device_ids +def _mlu_ids(): + mlus_env = os.getenv("FLAGS_selected_mlus") + if mlus_env: + device_ids = [int(s) for s in mlus_env.split(",")] + else: + device_ids = six.moves.range(core.get_mlu_device_count()) + return device_ids + + def is_compiled_with_xpu(): """ Whether this whl package can be used to run the model on XPU. @@ -782,6 +817,48 @@ def cuda_pinned_places(device_count=None): return [core.CUDAPinnedPlace()] * device_count +def mlu_places(device_ids=None): + """ + **Note**: + For multi-card tasks, please use `FLAGS_selected_mlus` environment variable to set the visible MLU device. + This function creates a list of :code:`paddle.device.MLUPlace` objects. + If :code:`device_ids` is None, environment variable of + :code:`FLAGS_selected_mlus` would be checked first. For example, if + :code:`FLAGS_selected_mlus=0,1,2`, the returned list would + be [paddle.device.MLUPlace(0), paddle.device.MLUPlace(1), paddle.device.MLUPlace(2)]. + If :code:`FLAGS_selected_mlus` is not set, all visible + mlu places would be returned. + If :code:`device_ids` is not None, it should be the device + ids of MLUs. For example, if :code:`device_ids=[0,1,2]`, + the returned list would be + [paddle.device.MLUPlace(0), paddle.device.MLUPlace(1), paddle.device.MLUPlace(2)]. + + Parameters: + device_ids (list or tuple of int, optional): list of MLU device ids. + + Returns: + list of paddle.device.MLUPlace: Created MLU place list. + + Examples: + .. 
code-block:: python + + # required: mlu + + import paddle + import paddle.static as static + + paddle.enable_static() + mlu_places = static.mlu_places() + """ + assert core.is_compiled_with_mlu(), \ + "Not compiled with MLU" + if device_ids is None: + device_ids = _mlu_ids() + elif not isinstance(device_ids, (list, tuple)): + device_ids = [device_ids] + return [core.MLUPlace(dev_id) for dev_id in device_ids] + + class NameScope(object): def __init__(self, name="", parent=None): self._children = dict() @@ -985,6 +1062,14 @@ def _varbase_creator(type=core.VarDesc.VarType.LOD_TENSOR, if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) + if _in_eager_mode(): + eager_tensor = core.eager.EagerTensor( + dtype if dtype else core.VarDesc.VarType.FP32, + list(shape) if shape else [], name, type + if type else core.VarDesc.VarType.LOD_TENSOR, True + if persistable else False) + eager_tensor.retain_grads() + return eager_tensor return core.VarBase(dtype if dtype else core.VarDesc.VarType.FP32, list(shape) if shape else [], name, type if type else core.VarDesc.VarType.LOD_TENSOR, True @@ -996,6 +1081,8 @@ class VariableMetaClass(type): def __instancecheck__(cls, instance): t = type(instance) if in_dygraph_mode(): + if _in_eager_mode(): + return issubclass(t, core.eager.EagerTensor) return issubclass(t, core.VarBase) else: return issubclass(t, Variable) @@ -1006,6 +1093,8 @@ class ParameterMetaClass(VariableMetaClass): def __instancecheck__(cls, instance): t = type(instance) if in_dygraph_mode(): + if _in_eager_mode(): + return issubclass(t, EagerParamBase) return issubclass(t, ParamBase) else: return issubclass(t, Parameter) @@ -1461,6 +1550,33 @@ def to_string(self, throw_on_error, with_details=False): __repr__ = __str__ + def element_size(self): + """ + Returns the size in bytes of an element in the Tensor. + + Examples: + .. 
code-block:: python + + import paddle + paddle.enable_static() + + x = paddle.static.data(name='x1', shape=[3, 2], dtype='bool') + x.element_size() # 1 + + x = paddle.static.data(name='x2', shape=[3, 2], dtype='int16') + x.element_size() # 2 + + x = paddle.static.data(name='x3', shape=[3, 2], dtype='float16') + x.element_size() # 2 + + x = paddle.static.data(name='x4', shape=[3, 2], dtype='float32') + x.element_size() # 4 + + x = paddle.static.data(name='x5', shape=[3, 2], dtype='float64') + x.element_size() # 8 + """ + return self.desc.element_size() + @property def stop_gradient(self): """ @@ -2112,6 +2228,10 @@ def set_value(self, value, scope=None): p = core.Place() p.set_place(t._place()) place = core.NPUPlace(p.npu_device_id()) + elif p.is_mlu_place(): + p = core.Place() + p.set_place(t._place()) + place = core.MLUPlace(p.mlu_device_id()) else: p = core.Place() p.set_place(t._place()) @@ -3291,7 +3411,10 @@ def create_parameter(self, *args, **kwargs): global_block = self.program.global_block() param = None if in_dygraph_mode(): - param = ParamBase(*args, **kwargs) + if _in_eager_mode(): + param = EagerParamBase(*args, **kwargs) + else: + param = ParamBase(*args, **kwargs) else: param = Parameter(global_block, *args, **kwargs) @@ -3336,6 +3459,7 @@ def append_op(self, *args, **kwargs): """ if in_dygraph_mode(): attrs = kwargs.get("attrs", {}) + inplace_map = kwargs.get("inplace_map", None) type = kwargs.get("type", None) op = Operator( block=self, @@ -3354,7 +3478,8 @@ def append_op(self, *args, **kwargs): kwargs.get("inputs", {}), kwargs.get("outputs", {}), attrs if attrs else {}, - kwargs.get("stop_gradient", False)) + kwargs.get("stop_gradient", False), + inplace_map) else: from paddle.fluid.dygraph.base import param_guard @@ -6288,6 +6413,153 @@ def _copy_to(self, device, blocking): __repr__ = __str__ +if hasattr(core, "eager"): + _core_eager_eagertensor = core.eager.EagerTensor +else: + _core_eager_eagertensor = object + + +class EagerParamBase(_core_eager_eagertensor): + """ + EagerParamBase is derived from Tensor( Which is the concept in Eager-Dygraph Mode). + A EagerParamBase is a persistable Tensor, and will be updated by optimizers + after each iteration. + The training of a neural network is essentially the updating of + its EagerParamBase. + + Relative to a general Tensor, a EagerParamBase has several its own + member variables: + + Args: + trainable(bool): True if the EagerParamBase need to be updated after + iterations. + optimize_attr(map): EagerParamBase attributes related with optimizing. + Currently, it only contains 'learning_rate'. + Default: {'learning_rate': 1.0} + regularizer(WeightDecayRegularizer): The Regularizer which will + be applied on the EagerParamBase. Default: None + do_model_average(bool): True if the model average strategy will + be applied on this EagerParamBase. + need_clip (bool): Whether the parameter gradient need to be cliped + in optimizer. Default is True. 
+ """ + + @dygraph_only + def __init__(self, shape, dtype, **kwargs): + if shape is None: + raise ValueError("The shape of Parameter should not be None") + if dtype is None: + raise ValueError("The dtype of Parameter should not be None") + + if len(shape) == 0: + raise ValueError( + "The dimensions of shape for Parameter must be greater than 0") + + for each in shape: + if each < 0: + raise ValueError( + "Each dimension of shape for Parameter must be greater than 0, but received %s" + % list(shape)) + + if dtype is not None: + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + + name = kwargs.get('name', unique_name.generate('_eager_param_base')) + + super(EagerParamBase, self).__init__( + dtype if dtype else core.VarDesc.VarType.FP32, + list(shape) + if shape else [], name, core.VarDesc.VarType.LOD_TENSOR, True) + self.retain_grads() + + trainable = kwargs.get('trainable', True) + self.stop_gradient = not trainable + + self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0}) + + self.regularizer = kwargs.get('regularizer', None) + + self.do_model_average = kwargs.get('do_model_average', None) + + self.need_clip = kwargs.get('need_clip', True) + + self.is_distributed = kwargs.get('is_distributed', False) + # self.block = default_main_program().global_block() + + @property + def trainable(self): + return not self.stop_gradient + + @trainable.setter + def trainable(self, trainable): + if isinstance(trainable, bool): + self.stop_gradient = not trainable + else: + raise ValueError( + "The type of trainable MUST be bool, but the type is ", + type(trainable)) + + def __str__(self): + """ + Convert a EagerParamBase object to a readable string. + + Returns(str): A readable string. + + Examples: + .. code-block:: python + + import paddle + linear = paddle.nn.Linear(3, 3) + print(linear.weight) + # Parameter containing: + # Tensor(shape=[3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[ 0.48948765, 0.05829060, -0.25524026], + # [-0.70368278, 0.52986908, -0.68742192], + # [-0.54217887, 0.48439729, 0.34082305]]) + """ + return "Parameter containing:\n{tensor}".format( + tensor=super(EagerParamBase, self).__str__()) + + def __deepcopy__(self, memo): + """ + Deep copy parameter, it will always performs Tensor copy. + + Examples: + .. code-block:: python + + import paddle + import copy + linear = paddle.nn.Linear(1, 3) + linear_copy = copy.deepcopy(linear) + + print(linear.weight) + # Parameter containing: + # Tensor(shape=[1, 3], dtype=float32, place=CPUPlace, stop_gradient=False, + # [[-0.30929261, -0.90929240, -1.07851017]]) + + print(linear_copy.weight) + # Parameter containing: + # Tensor(shape=[1, 3], dtype=float32, place=CPUPlace, stop_gradient=False, + # [[-0.30929261, -0.90929240, -1.07851017]]) + + """ + state = copy.deepcopy(self.__dict__, memo) + state["name"] = self.name + unique_name.generate("_deepcopy") + new_param = EagerParamBase(self.shape, self.dtype, **state) + memo[id(self)] = new_param + new_param.copy_(self, True) + return new_param + + def _copy_to(self, device, blocking): + state = copy.deepcopy(self.__dict__) + new_param = EagerParamBase(self.shape, self.dtype, **state) + core.eager.tensor_copy(self, new_param, device, blocking) + return new_param + + __repr__ = __str__ + + # program is a global instance. 
_main_program_ = Program() _startup_program_ = Program() @@ -6648,7 +6920,7 @@ def _get_paddle_place(place): return place if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace, - core.IPUPlace)): + core.MLUPlace, core.IPUPlace)): return place if not isinstance(place, str): @@ -6703,6 +6975,18 @@ def _get_paddle_place(place): device_id = int(device_id) return core.NPUPlace(device_id) + # MLU + avaliable_mlu_place = re.match(r'mlu:\d+', place) + if avaliable_mlu_place: + if not core.is_compiled_with_mlu(): + raise ValueError( + "The device should not be {}, since PaddlePaddle is " \ + "not compiled with MLU".format(avaliable_mlu_place)) + place_info_list = place.split(':', 1) + device_id = place_info_list[1] + device_id = int(device_id) + return core.MLUPlace(device_id) + # IPU avaliable_ipu_place = re.match(r'ipu:\d+', place) if avaliable_ipu_place: @@ -6716,7 +7000,7 @@ def _get_paddle_place(place): return core.IPUPlace(device_id) raise ValueError( - "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, NPUPlace and IPUPlace, but received {}.". + "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, MLUPlace, MLUPlace and IPUPlace, but received {}.". format(place)) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 3b4a3aacc06c6..5d7dacc007e6b 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -94,7 +94,8 @@ def __init__(self, optimizer): ".batch_size@GRAD", ".batch_square_sum@GRAD", ".batch_sum@GRAD" ] self.supported_embedding_types = [ - "lookup_table", "pull_sparse", "pull_sparse_v2", "pull_box_sparse" + "lookup_table", "pull_sparse", "pull_sparse_v2", "pull_box_sparse", + "pull_gpups_sparse" ] self.supported_embedding_grad_types = [ "lookup_table_grad", "push_sparse", "push_sparse_v2" diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index a7631848cd38c..6ef3646a91943 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -22,6 +22,7 @@ from .core import VarDesc from . 
import unique_name from .data_feeder import check_variable_and_dtype, check_type, check_dtype +from paddle import _C_ops __all__ = [ 'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear', @@ -132,47 +133,33 @@ def __call__(self, var, block=None): """ block = self._check_block(block) - assert isinstance(var, framework.Variable) + assert (isinstance(var, framework.Variable) or + isinstance(var, framework.EagerParamBase)) assert isinstance(block, framework.Block) - # to be compatible of fp16 initializers - if var.dtype == VarDesc.VarType.FP16: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate(".".join( - ['constant_init', var.name, 'tmp'])), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False) + if framework.in_dygraph_mode(): + var = _C_ops.fill_constant( + var, 'value', + float(self._value), 'force_cpu', self._force_cpu, 'dtype', + int(var.dtype), 'str_value', + str(float(self._value)), 'shape', var.shape) + return None else: - out_dtype = var.dtype - out_var = var - - # fill constant should set the "str_value" to preserve precision - op = block.append_op( - type="fill_constant", - outputs={"Out": out_var}, - attrs={ - "shape": var.shape, - "dtype": int(out_dtype), - "value": float(self._value), - 'str_value': str(float(self._value)), - 'force_cpu': self._force_cpu - }, - stop_gradient=True) - - if var.dtype == VarDesc.VarType.FP16: - block.append_op( - type="cast", - inputs={"X": out_var}, + # fill constant should set the "str_value" to preserve precision + op = block.append_op( + type="fill_constant", outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) + attrs={ + "shape": var.shape, + "dtype": int(var.dtype), + "value": float(self._value), + 'str_value': str(float(self._value)), + 'force_cpu': self._force_cpu + }, + stop_gradient=True) - if not framework.in_dygraph_mode(): var.op = op - return op + return op class UniformInitializer(Initializer): @@ -257,33 +244,45 @@ def __call__(self, var, block=None): out_dtype = var.dtype out_var = var - op = block.append_op( - type="uniform_random", - inputs={}, - outputs={"Out": out_var}, - attrs={ - "shape": var.shape, - "dtype": out_dtype, - "min": self._low, - "max": self._high, - "seed": self._seed, - "diag_num": self._diag_num, - "diag_step": self._diag_step, - "diag_val": self._diag_val - }, - stop_gradient=True) + if framework.in_dygraph_mode(): + out_var = _C_ops.uniform_random( + 'shape', var.shape, 'min', self._low, 'max', self._high, 'seed', + self._seed, 'dtype', out_dtype, 'diag_num', self._diag_num, + 'diag_step', self._diag_step, 'diag_val', self._diag_val) + if var.dtype == VarDesc.VarType.FP16: + var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) + var.copy_(var_tmp, False) + else: + var.copy_(out_var, False) + return None + else: + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": var.shape, + "dtype": out_dtype, + "min": self._low, + "max": self._high, + "seed": self._seed, + "diag_num": self._diag_num, + "diag_step": self._diag_step, + "diag_val": self._diag_val + }, + stop_gradient=True) - if var.dtype == VarDesc.VarType.FP16: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) + if var.dtype == VarDesc.VarType.FP16: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + 
attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) - if not framework.in_dygraph_mode(): var.op = op - return op + return op class NormalInitializer(Initializer): @@ -335,26 +334,12 @@ def __call__(self, var, block=None): if self._seed == 0: self._seed = block.program.random_seed - # to be compatible of fp16 initalizers - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var( - name=unique_name.generate(".".join( - ['gaussian_random', var.name, 'tmp'])), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False) - else: - out_dtype = var.dtype - out_var = var - op = block.append_op( type="gaussian_random", - outputs={"Out": out_var}, + outputs={"Out": var}, attrs={ "shape": var.shape, - "dtype": out_dtype, + "dtype": var.dtype, "mean": self._mean, "std": self._std_dev, "seed": self._seed, @@ -362,16 +347,11 @@ def __call__(self, var, block=None): }, stop_gradient=True) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) if not framework.in_dygraph_mode(): var.op = op - return op + return op + else: + return None class TruncatedNormalInitializer(Initializer): @@ -433,28 +413,39 @@ def __call__(self, var, block=None): out_dtype = var.dtype out_var = var - op = block.append_op( - type="truncated_gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": var.shape, - "dtype": out_dtype, - "mean": self._mean, - "std": self._std_dev, - "seed": self._seed - }, - stop_gradient=True) + if framework.in_dygraph_mode(): + out_var = _C_ops.truncated_gaussian_random( + 'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean, + 'std', self._std_dev, 'seed', self._seed) + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) + var.copy_(var_tmp, False) + else: + var.copy_(out_var, False) + return None + else: + op = block.append_op( + type="truncated_gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": var.shape, + "dtype": out_dtype, + "mean": self._mean, + "std": self._std_dev, + "seed": self._seed + }, + stop_gradient=True) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) - if not framework.in_dygraph_mode(): + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) var.op = op - return op + return op class XavierInitializer(Initializer): @@ -553,47 +544,66 @@ def __call__(self, var, block=None): out_dtype = var.dtype out_var = var - if self._uniform: - limit = np.sqrt(6.0 / float(fan_in + fan_out)) - op = block.append_op( - type="uniform_random", - inputs={}, - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": out_dtype, - "min": -limit, - "max": limit, - "seed": self._seed - }, - stop_gradient=True) - + if framework.in_dygraph_mode(): + if self._uniform: + limit = np.sqrt(6.0 / float(fan_in + fan_out)) + out_var = _C_ops.uniform_random('shape', out_var.shape, 'min', + -limit, 'max', limit, 'seed', + self._seed, 'dtype', out_dtype) + else: + std = np.sqrt(2.0 / float(fan_in + fan_out)) 
+ out_var = _C_ops.gaussian_random( + 'shape', out_var.shape, 'dtype', out_dtype, 'mean', 0.0, + 'std', std, 'seed', self._seed) + + if var.dtype == VarDesc.VarType.FP16 or ( + var.dtype == VarDesc.VarType.BF16 and not self._uniform): + var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) + var.copy_(var_tmp, False) + else: + var.copy_(out_var, False) + return None else: - std = np.sqrt(2.0 / float(fan_in + fan_out)) - op = block.append_op( - type="gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": out_dtype, - "mean": 0.0, - "std": std, - "seed": self._seed - }, - stop_gradient=True) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform): - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) + if self._uniform: + limit = np.sqrt(6.0 / float(fan_in + fan_out)) + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": out_dtype, + "min": -limit, + "max": limit, + "seed": self._seed + }, + stop_gradient=True) + else: + std = np.sqrt(2.0 / float(fan_in + fan_out)) + op = block.append_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": out_dtype, + "mean": 0.0, + "std": std, + "seed": self._seed + }, + stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16 or ( + var.dtype == VarDesc.VarType.BF16 and not self._uniform): + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) - if not framework.in_dygraph_mode(): var.op = op - return op + return op class MSRAInitializer(Initializer): @@ -686,47 +696,68 @@ def __call__(self, var, block=None): out_dtype = var.dtype out_var = var - if self._uniform: - limit = np.sqrt(6.0 / float(fan_in)) - op = block.append_op( - type="uniform_random", - inputs={}, - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": int(out_dtype), - "min": -limit, - "max": limit, - "seed": self._seed - }, - stop_gradient=True) - + if framework.in_dygraph_mode(): + if self._uniform: + limit = np.sqrt(6.0 / float(fan_in)) + out_var = _C_ops.uniform_random('shape', out_var.shape, 'min', + -limit, 'max', limit, 'seed', + self._seed, 'dtype', + int(out_dtype)) + else: + std = np.sqrt(2.0 / float(fan_in)) + out_var = _C_ops.gaussian_random( + 'shape', out_var.shape, 'dtype', + int(out_dtype), 'mean', 0.0, 'std', std, 'seed', self._seed) + + if var.dtype == VarDesc.VarType.FP16 or ( + var.dtype == VarDesc.VarType.BF16 and not self._uniform): + var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) + var.copy_(var_tmp, False) + else: + var.copy_(out_var, False) + return None else: - std = np.sqrt(2.0 / float(fan_in)) - op = block.append_op( - type="gaussian_random", - outputs={"Out": out_var}, - attrs={ - "shape": out_var.shape, - "dtype": int(out_dtype), - "mean": 0.0, - "std": std, - "seed": self._seed - }, - stop_gradient=True) - - if var.dtype == VarDesc.VarType.FP16 or ( - var.dtype == VarDesc.VarType.BF16 and not self._uniform): - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) + if self._uniform: + limit = np.sqrt(6.0 / float(fan_in)) + op = block.append_op( + type="uniform_random", + inputs={}, + 
outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "min": -limit, + "max": limit, + "seed": self._seed + }, + stop_gradient=True) + + else: + std = np.sqrt(2.0 / float(fan_in)) + op = block.append_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "mean": 0.0, + "std": std, + "seed": self._seed + }, + stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16 or ( + var.dtype == VarDesc.VarType.BF16 and not self._uniform): + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) - if not framework.in_dygraph_mode(): var.op = op - return op + return op class BilinearInitializer(Initializer): @@ -839,28 +870,44 @@ def __call__(self, var, block=None): if np.prod(shape) > 1024 * 1024: raise ValueError("The size of input is too big. ") - op = block.append_op( - type='assign_value', - outputs={'Out': [out_var]}, - attrs={ - 'dtype': out_dtype, - 'shape': list(shape), - value_name: values - }) - if var.dtype in [ - VarDesc.VarType.FP16, VarDesc.VarType.BF16, VarDesc.VarType.FP64 - ]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) + if framework.in_dygraph_mode(): + out_var = _C_ops.assign_value('shape', + list(shape), 'dtype', out_dtype, + value_name, values) + if var.dtype in [ + VarDesc.VarType.FP16, VarDesc.VarType.BF16, + VarDesc.VarType.FP64 + ]: + var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) + var.copy_(var_tmp, False) + else: + var.copy_(out_var, False) + return None + else: + op = block.append_op( + type='assign_value', + outputs={'Out': [out_var]}, + attrs={ + 'dtype': out_dtype, + 'shape': list(shape), + value_name: values + }) + + if var.dtype in [ + VarDesc.VarType.FP16, VarDesc.VarType.BF16, + VarDesc.VarType.FP64 + ]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) - if not framework.in_dygraph_mode(): var.op = op - return op + return op class NumpyArrayInitializer(Initializer): @@ -932,27 +979,39 @@ def __call__(self, var, block=None): if self._value.size > 1024 * 1024 * 1024: raise ValueError("The size of input is too big. 
Please consider " "saving it to file and 'load_op' to load it") - op = block.append_op( - type='assign_value', - outputs={'Out': out_var}, - attrs={ - 'dtype': out_dtype, - 'shape': list(self._value.shape), - value_name: values - }, - stop_gradient=True) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op( - type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={"in_dtype": out_var.dtype, - "out_dtype": var.dtype}) + if framework.in_dygraph_mode(): + out_var = _C_ops.assign_value('shape', + list(self._value.shape), 'dtype', + out_dtype, value_name, values) + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + var_tmp = _C_ops.cast(out_var, 'in_dtype', out_var.dtype, + 'out_dtype', var.dtype) + var.copy_(var_tmp, False) + else: + var.copy_(out_var, False) + return None + else: + op = block.append_op( + type='assign_value', + outputs={'Out': out_var}, + attrs={ + 'dtype': out_dtype, + 'shape': list(self._value.shape), + value_name: values + }, + stop_gradient=True) + + if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) - if not framework.in_dygraph_mode(): var.op = op - return op + return op def set_global_initializer(weight_init, bias_init=None): @@ -1032,16 +1091,17 @@ def _global_bias_initializer(): def calculate_gain(nonlinearity, param=None): """ - Get the recommended gain value of some nonlinearity function. + Get the recommended ``gain`` value of some nonlinearity function. ``gain`` value can be used in some + ``paddle.nn.initializer`` api to adjust the initialization value. Args: - nonlinearity(str): name of nonlinearity activation function. If it is a linear function, which is one of - "linear/conv1d/conv2d/conv3d/conv1d_transpose/conv2d_transpose/conv3d_transpose" , will return 1.0 + nonlinearity(str): name of nonlinearity activation function. If it is a linear function, such as: + `linear/conv1d/conv2d/conv3d/conv1d_transpose/conv2d_transpose/conv3d_transpose` , 1.0 will be returned. param(bool|int|float, optional): optional parameter for somme nonlinearity function. Now, it only applies to - 'leaky_relu'. Default: None, it will be calculated as 0.01 in the formula. + 'leaky_relu'. Default: None, it will be calculated as 0.01 in the formula. Returns: - The recommended gain value for nonlinearity function. + A float value, which is the recommended gain for this nonlinearity function. Examples: .. 
code-block:: python diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index e110c47d790f1..4bbc0ba03c934 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -2100,6 +2100,10 @@ def set_var(var, ndarray): p = paddle.fluid.core.Place() p.set_place(t._place()) place = paddle.fluid.NPUPlace(p.npu_device_id()) + elif p.is_mlu_place(): + p = paddle.fluid.core.Place() + p.set_place(t._place()) + place = paddle.fluid.MLUPlace(p.mlu_device_id()) else: p = paddle.fluid.core.Place() p.set_place(t._place()) @@ -2394,6 +2398,10 @@ def set_program_state(program, state_dict): p = paddle.fluid.core.Place() p.set_place(ten_place) py_place = paddle.fluid.NPUPlace(p.npu_device_id()) + elif ten_place.is_mlu_place(): + p = paddle.fluid.core.Place() + p.set_place(ten_place) + py_place = paddle.fluid.MLUPlace(p.mlu_device_id()) ten.set(new_para_np, py_place) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 2b677c11e9d96..72cdd1f9ad5ff 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -24,6 +24,7 @@ from . import core from six.moves import zip from .layer_helper_base import LayerHelperBase +from .dygraph_utils import _append_activation_in_dygraph class LayerHelper(LayerHelperBase): @@ -145,21 +146,27 @@ def append_activation(self, input_var): else: raise TypeError(str(act) + " should be unicode or str") + use_cudnn = None if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'): - act['use_cudnn'] = self.kwargs.get('use_cudnn') + use_cudnn = self.kwargs.get('use_cudnn') + act['use_cudnn'] = use_cudnn use_mkldnn = self.kwargs.get( 'use_mkldnn', _global_flags().get("FLAGS_use_mkldnn", False)) if use_mkldnn: act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') - - tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) - self.append_op( - type=act_type, - inputs={"X": [input_var]}, - outputs={"Out": [tmp]}, - attrs=act) - return tmp + if in_dygraph_mode(): + res = _append_activation_in_dygraph(input_var, act_type, use_cudnn, + use_mkldnn) + return res + else: + tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) + self.append_op( + type=act_type, + inputs={"X": [input_var]}, + outputs={"Out": [tmp]}, + attrs=act) + return tmp #TODO (jiabin): should we remove this since it has never be used def _get_default_initializer(self, dtype): diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index c2de5670eb42c..67fcd901dedc9 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -17,7 +17,7 @@ import copy import numpy as np -from .framework import Variable, default_main_program, default_startup_program, in_dygraph_mode, _current_expected_place +from .framework import Variable, default_main_program, default_startup_program, in_dygraph_mode, _current_expected_place, _in_eager_mode from . import unique_name from .param_attr import ParamAttr, WeightNormParamAttr from . 
import core @@ -84,13 +84,19 @@ def to_variable(self, value, name=None): if isinstance(value, np.ndarray): assert in_dygraph_mode( ), "to_variable could only be called in dygraph mode" - py_var = core.VarBase( - value=value, - name=name if name else '', - persistable=False, - place=_current_expected_place(), - zero_copy=False) - return py_var + if _in_eager_mode(): + return core.eager.EagerTensor(value, + _current_expected_place(), False, + False, name + if name else None, True) + else: + py_var = core.VarBase( + value=value, + name=name if name else '', + persistable=False, + place=_current_expected_place(), + zero_copy=False) + return py_var elif isinstance(value, (core.VarBase, Variable)): return value else: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c7fb75387aa31..1c357c6fa74d5 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -665,6 +665,69 @@ def _pull_sparse_v2(input, return outs +def _pull_gpups_sparse(input, + size, + dtype='float32', + is_distributed=False, + is_sparse=False): + r""" + **Pull GpuPS Sparse Layer** + + This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in + GpuPS lookup table. The result of this lookup is the embedding of each ID in the + :attr:`input`. + + Args: + input(Variable|list of Variable): Input is a Tensor Variable, which + contains the IDs information. + size(int|list of int): The embedding size parameter of each input, which indicates the size of + each embedding vector respectively. + dtype(str): The dtype refers to the data type of output tensor. Only supports + float32 now. + + Returns: + Variable|list of Variable: The tensor variable storing the embeddings of the \ + supplied inputs, whose size are indicated by size respectively. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + slots = [] + data_1 = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1) + slots.append(data_1) + data_2 = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1) + slots.append(data_2) + embs = fluid.layers.pull_gpups_sparse(input=slots, size=[11, 35]) + """ + helper = LayerHelper('pull_gpups_sparse', **locals()) + if dtype != 'float32': + raise ValueError( + "GpuPS only support float type embedding now, and your type is: " + + dtype) + helper.input_dtype() + inputs = helper.multiple_input() + outs = [ + helper.create_variable_for_type_inference(dtype) + for i in range(len(inputs)) + ] + w = helper.create_parameter( + attr=helper.param_attr, shape=[11], dtype=dtype, is_bias=False) + helper.append_op( + type='pull_gpups_sparse', + inputs={'Ids': inputs, + 'W': w}, + outputs={'Out': outs}, + attrs={ + 'size': size, + 'is_distributed': is_distributed, + 'is_sparse': is_sparse + }) + if len(outs) == 1: + return outs[0] + return outs + + def _pull_box_sparse(input, size, dtype='float32', @@ -9864,7 +9927,7 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): #NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version. 
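    # For NHWC inputs the channel dimension is the last axis, so the per-channel alpha created below must be shaped [1, 1, 1, x.shape[-1]]; using x.shape[1] here would pick up the height dimension instead of the channel count.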
#NOTE(GuoxiaWang): support NHWC data format if data_format == 'NHWC': - alpha_shape = [1, 1, 1, x.shape[1]] + alpha_shape = [1, 1, 1, x.shape[-1]] else: alpha_shape = [1, x.shape[1], 1, 1] @@ -15128,6 +15191,9 @@ def mish(x, threshold=20, name=None): out, = exe.run(feed={'x':x_data}, fetch_list=[y.name]) print(out) # [[0.66666667, 1.66666667, 3., 4.]] """ + if in_dygraph_mode(): + return _C_ops.mish(x, 'threshold', threshold) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'mish') check_type(threshold, 'threshold', (float, int), 'mish') assert threshold > 0, "threshold of mish should be greater than 0, " \ @@ -15139,7 +15205,7 @@ def mish(x, threshold=20, name=None): type='mish', inputs={'X': x}, outputs={'Out': out}, - attrs={'threshold': threshold or -1}) + attrs={'threshold': threshold}) return out diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index f877a00c51127..b8fb17a8028c7 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -363,7 +363,7 @@ def convert_shape_to_list(shape): """ if isinstance(shape, (list, tuple)): shape = list( - map(lambda x: x.numpy()[0] if isinstance(x, Variable) else x, + map(lambda x: x.numpy().flat[0] if isinstance(x, Variable) else x, shape)) else: shape = shape.numpy().astype(int).tolist() diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ae2c87938c682..ee01036ca931f 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1296,6 +1296,7 @@ def __init__(self, parameter_list=None, regularization=None, grad_clip=None, + multi_precision=False, name=None): assert learning_rate is not None super(SGDOptimizer, self).__init__( @@ -1306,26 +1307,86 @@ def __init__(self, name=name) self.type = "sgd" self._use_mkldnn = False + self._multi_precision = multi_precision + self._master_weights = {} + + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var + return var + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + if isinstance(parameters, dict): + parameters = self._update_param_group(parameters) + + # Create accumulator tensors for first and second moments + for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + continue + if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Adam optimizer." 
+ ) @no_grad def _append_optimize_op(self, block, param_and_grad): + + find_master = self._multi_precision and param_and_grad[ + 0].dtype == core.VarDesc.VarType.FP16 + master_weight = (self._master_weights[param_and_grad[0].name] + if find_master else None) + lr = self._create_param_lr(param_and_grad) if framework.in_dygraph_mode(): - _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], - param_and_grad[0]) + _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight, + param_and_grad[0], master_weight) return None assert isinstance(block, framework.Block) # create the optimize op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": lr + } + + outputs = {"ParamOut": param_and_grad[0]} + + attrs = {"multi_precision": find_master} + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + sgd_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": lr - }, - attrs={"use_mkldnn": self._use_mkldnn}, - outputs={"ParamOut": param_and_grad[0]}, + inputs=inputs, + outputs=outputs, + attrs=attrs, stop_gradient=True) return sgd_op diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index fc48a48450efd..183a00bd70bdf 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -18,6 +18,7 @@ from .wrapped_decorator import signature_safe_contextmanager import os import six +import sys __all__ = [ 'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler', @@ -355,3 +356,33 @@ def profiler(state, yield finally: stop_profiler(sorted_key, profile_path) + + +@signature_safe_contextmanager +def _nvprof_range(iter_id, start, end, exit_after_prof=True): + ''' + A range profiler interface (not public yet). + + Examples: + + .. 
code-block:: python + + model = Model() + for i in range(max_iter): + paddle.fluid.profiler._nvprof_range(i, 10, 20): + out = model(in) + ''' + try: + if iter_id == start: + core.nvprof_start() + core.nvprof_enable_record_event() + if iter_id >= start: + core.nvprof_nvtx_push(str(iter_id)) + yield + finally: + if iter_id < end: + core.nvprof_nvtx_pop() + if iter_id == end: + core.nvprof_stop() + if exit_after_prof: + sys.exit() diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index b2ef90bf87a1a..c5ec3191c1b02 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -105,3 +105,49 @@ PD_BUILD_GRAD_OP(custom_relu) .Inputs({"X", "Out", paddle::Grad("Out")}) .Outputs({paddle::Grad("X")}) .SetKernelFn(PD_KERNEL(ReluBackward)); + +std::vector relu_cpu_backward_without_x( + const paddle::Tensor& out, const paddle::Tensor& grad_out) { + auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + + PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { + relu_cpu_backward_kernel( + grad_out.data(), + out.data(), + grad_x.mutable_data(out.place()), + out.size()); + })); + + return {grad_x}; +} + +std::vector relu_cuda_backward_without_x( + const paddle::Tensor& out, const paddle::Tensor& grad_out); + +std::vector ReluBackwardWithoutX( + const paddle::Tensor& out, const paddle::Tensor& grad_out) { + if (out.place() == paddle::PlaceType::kCPU) { + return relu_cpu_backward_without_x(out, grad_out); + } else if (out.place() == paddle::PlaceType::kGPU) { + return relu_cuda_backward_without_x(out, grad_out); + } else { + PD_THROW("Not implemented."); + } +} + +std::vector> ReluBackwardWithoutXInferShape( + const std::vector& out_shape, + const std::vector& grad_out_shape) { + return {out_shape}; +} + +PD_BUILD_OP(custom_relu_no_x_in_backward) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(ReluForward)); + +PD_BUILD_GRAD_OP(custom_relu_no_x_in_backward) + .Inputs({"Out", paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(ReluBackwardWithoutX)) + .SetInferShapeFn(PD_INFER_SHAPE(ReluBackwardWithoutXInferShape)); diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index dda42a5c05984..637deeb90569c 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -70,3 +70,22 @@ std::vector relu_cuda_backward(const paddle::Tensor& x, return {grad_x}; } + +std::vector relu_cuda_backward_without_x( + const paddle::Tensor& out, const paddle::Tensor& grad_out) { + auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, out.shape()); + + int numel = out.size(); + int block = 512; + int grid = (numel + block - 1) / block; + PD_DISPATCH_FLOATING_AND_HALF_TYPES( + out.type(), "relu_cuda_backward_kernel", ([&] { + relu_cuda_backward_kernel<<>>( + grad_out.data(), + out.data(), + grad_x.mutable_data(out.place()), + numel); + })); + + return {grad_x}; +} diff --git a/python/paddle/fluid/tests/custom_op/custom_simple_slice_op.cc b/python/paddle/fluid/tests/custom_op/custom_simple_slice_op.cc new file mode 100644 index 0000000000000..783e0cd96fdd9 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/custom_simple_slice_op.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/extension.h" + +#define CHECK_INPUT(x) \ + PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +std::vector SimpleSliceFunction(const paddle::Tensor& x, + int64_t begin_index, + int64_t end_index) { + return {x.slice(begin_index, end_index)}; +} + +std::vector> SimpleSliceInferShape( + const std::vector& x_shape, + int64_t begin_index, + int64_t end_index) { + PD_CHECK(begin_index > 0, "The begin index is out of bound."); + PD_CHECK(end_index > 0, "The end index must is out of bound."); + PD_CHECK(begin_index < end_index, + "The begin index is greater than end index."); + auto out_shape = x_shape; + out_shape[0] = end_index - begin_index; + return {out_shape}; +} + +PD_BUILD_OP(custom_simple_slice) + .Inputs({"X"}) + .Outputs({"Out"}) + .Attrs({"begin_index: int64_t", "end_index: int64_t"}) + .SetKernelFn(PD_KERNEL(SimpleSliceFunction)) + .SetInferShapeFn(PD_INFER_SHAPE(SimpleSliceInferShape)); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 4f075066b9d93..16458841f4488 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -49,7 +49,8 @@ class TestJITLoad(unittest.TestCase): def setUp(self): self.custom_ops = [ - custom_module.custom_relu, custom_module.custom_relu_dup + custom_module.custom_relu, custom_module.custom_relu_dup, + custom_module.custom_relu_no_x_in_backward ] self.dtypes = ['float32', 'float64'] if paddle.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py new file mode 100644 index 0000000000000..c60bac4060b64 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtaina copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np + +import paddle +from paddle.utils.cpp_extension import load, get_build_directory +from paddle.utils.cpp_extension.extension_utils import run_cmd +from utils import paddle_includes, extra_cc_args, extra_nvcc_args + +# Because Windows don't use docker, the shared lib already exists in the +# cache dir, it will not be compiled again unless the shared lib is removed. 
+file = '{}\\custom_simple_slice\\custom_simple_slice.pyd'.format( + get_build_directory()) +if os.name == 'nt' and os.path.isfile(file): + cmd = 'del {}'.format(file) + run_cmd(cmd, True) + +custom_ops = load( + name='custom_simple_slice_jit', + sources=['custom_simple_slice_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=extra_cc_args, # test for cc flags + extra_cuda_cflags=extra_nvcc_args, # test for nvcc flags + verbose=True) + + +class TestCustomSimpleSliceJit(unittest.TestCase): + def test_slice_output(self): + np_x = np.random.random((5, 2)).astype("float32") + x = paddle.to_tensor(np_x) + custom_op_out = custom_ops.custom_simple_slice(x, 2, 3) + np_out = np_x[2:3] + self.assertTrue( + np.array_equal(custom_op_out, np_out), + "custom op: {},\n numpy: {}".format(np_out, custom_op_out.numpy())) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 399ec47e6d617..a438eb1aa0d69 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -772,6 +772,7 @@ add_subdirectory(sequence) add_subdirectory(dygraph_to_static) add_subdirectory(rnn) add_subdirectory(autograd) +add_subdirectory(distribution) if (NOT WIN32 OR NOT WITH_GPU) add_subdirectory(fft) @@ -893,7 +894,7 @@ set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120) set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 150) +set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 200) set_tests_properties(test_empty_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_div_op PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py new file mode 100644 index 0000000000000..dd609d3ae2e11 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_sharding.py @@ -0,0 +1,120 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
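# The test below enables the sharding optimizer together with ASP (automatic structured sparsity) in a single DistributedStrategy, prunes the trained program via sparsity.prune_model(..., sharding=True), and then checks that every ASP-supported parameter satisfies the 2:4 (n=2, m=4) sparsity pattern.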
+ +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import os +import sys +from paddle.static import sparsity +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np +cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') +if cuda_visible_devices is None or cuda_visible_devices == "": + os.environ['CUDA_VISIBLE_DEVICES'] = '0' +else: + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0] + +paddle.enable_static() + + +class TestFleetWithASPSharding(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + os.environ['FLAGS_enable_parallel_graph'] = "0" + os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = "0.1" + os.environ['FLAGS_sync_nccl_allreduce'] = "1" + os.environ['FLAGS_eager_delete_tensor_gb'] = "0" + os.environ['FLAGS_fuse_parameter_memory_size'] = "32" + os.environ['FLAGS_fuse_parameter_groups_size'] = "50" + os.environ['FLAGS_check_nan_inf'] = "0" + + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + input_x = paddle.static.data( + name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = fluid.layers.fc(input=fc_1, size=64, act='tanh') + fc_3 = fluid.layers.fc(input=fc_2, size=64, act='tanh') + fc_4 = fluid.layers.fc(input=fc_3, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_4, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + dist_strategy = paddle.distributed.fleet.DistributedStrategy() + dist_strategy.sharding = True + dist_strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 32, + "segment_anchors": None, + "sharding_degree": 8, + "mp_degree": 1, + "hybrid_dp": False, + "gradient_merge_acc_step": 1 + } + dist_strategy.nccl_comm_num = 1 + dist_strategy.asp = True + return avg_cost, dist_strategy, input_x, input_y + + def test_with_asp_sharding(self): + if sys.platform == 'win32': + return + print(sys.platform) + fleet.init(is_collective=True) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy, input_x, input_y = self.net(train_prog, + startup_prog) + + with fluid.program_guard(train_prog, startup_prog): + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + if paddle.fluid.is_compiled_with_cuda(): + place = fluid.CUDAPlace( + int(os.environ.get('FLAGS_selected_gpus', 0))) + else: + place = fluid.CPUPlace() + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) + exe.run(startup_prog) + + sparsity.prune_model(train_prog, sharding=True) + + data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) + exe.run(train_prog, feed=feeder.feed([data])) + + for param in train_prog.global_block().all_parameters(): + if ASPHelper._is_supported_layer(train_prog, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + 
paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py index 9ec9ccc816cf1..b9ba724f2c556 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py @@ -967,23 +967,4 @@ def forward(self, prediction_scores, masked_lm_labels, loss_mask): loss_mask = loss_mask.reshape([-1]) masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) total_loss = masked_lm_loss / loss_mask.sum() - pp_total_loss = None - loss = total_loss - if "pp" in _global_parallel_strategy: - total_loss = total_loss - masked_lm_loss.persistable = True - total_loss.persistable = True - total_loss.persistable = True - pp_total_loss = paddle.fluid.layers.fill_constant([1, ], "float32", - 0.0) - pp_total_loss.persistable = True - block = paddle.static.default_main_program().global_block() - acc_steps = 1 - tmp = total_loss / acc_steps - block.append_op( - type="elementwise_add", - inputs={"X": [pp_total_loss], - "Y": [tmp]}, - outputs={"Out": [pp_total_loss]}) - loss = pp_total_loss - return loss + return total_loss diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt old mode 100644 new mode 100755 index e9146b68a9f88..48a9f7204aa8d --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -5,4 +5,5 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) list(APPEND DIST_TEST_OPS ${TEST_OP}) set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 90) + set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST") endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py new file mode 100644 index 0000000000000..42bdf67824220 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py @@ -0,0 +1,214 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
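# AutoPallelPassTestBase runs the same auto-parallel GPT model twice in separately launched processes (once with the pass under test applied, once without), dumps the fetched outputs of each run, and compares them within the configured rtol/atol.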
+ +import paddle +import os +import random +import sys +import pickle +import shlex +import shutil +import inspect +import numpy as np +from collections import OrderedDict +from dist_pass_test_base import DistPassTestBase + +import paddle.distributed.fleet as fleet +import paddle.distributed.auto_parallel as auto +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion + + +class AutoPallelPassTestBase(DistPassTestBase): + def setUp(self): + paddle.enable_static() + seed = int(os.environ.get('SEED', -1)) + if seed <= 0: + seed = np.random.randint(low=1, high=1000000, size=[1])[0] + os.environ['SEED'] = str(seed) + self.seed = seed + paddle.seed(self.seed) + + self.rtol = 1e-5 + self.atol = 1e-8 + self.equal_nan = False + + self.init() + + def init(self): + pass + + def get_model(self, place, **kwargs): + raise NotImplementedError() + + def apply_passes(self): + raise NotImplementedError() + + def apply_no_passes(self): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + + def check_main(self, gpus=None, **kwargs): + no_pass_rets = self._distributed_launch( + model=None, apply_pass=False, gpus=gpus, **kwargs) + pass_rets = self._distributed_launch( + model=None, apply_pass=True, gpus=gpus, **kwargs) + self.check_results(no_pass_rets, pass_rets) + + def _run_gpu_main(self, model, apply_pass, dump_file, **kwargs): + gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) + place = paddle.CUDAPlace(gpu_id) + scope = paddle.static.Scope() + if apply_pass: + self.apply_passes() + else: + self.apply_no_passes() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + with paddle.static.scope_guard(scope): + with paddle.fluid.unique_name.guard(): + main_prog, startup_prog, inputs, outputs, reader = self.get_model( + place, **kwargs) + inputs = self._to_var_names(inputs) + outputs = self._to_var_names(outputs) + + all_fetch_values = [] + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(scope): + exe.run(startup_prog) + for batch_id, input_data in enumerate(reader()): + assert len(input_data) == len(inputs), "{} vs {}".format( + len(input_data), len(inputs)) + feed = dict(zip(inputs, input_data)) + fetch_values = exe.run(main_prog, feed=feed, fetch_list=outputs) + if paddle.distributed.get_rank() == 0: + output_dict = OrderedDict(zip(outputs, fetch_values)) + print('batch {}, outputs {}'.format(batch_id, output_dict)) + all_fetch_values.append(fetch_values) + with open(dump_file, "wb") as f: + pickle.dump(all_fetch_values, f) + + def get_gpt_model(self, strategy, place, batch_size, sequence_len, + vocab_size): + modeling.init_global() + if strategy == "dp": + modeling._global_parallel_strategy = "dp" + modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + elif strategy == "mp": + modeling._global_parallel_strategy = "mp" + modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + elif strategy == "pp": + modeling._global_parallel_strategy = "pp" + modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + modeling.PP_MESH_LIST = [ + auto.ProcessMesh(mesh=[0]), auto.ProcessMesh(mesh=[1]) + ] + else: + raise ValueError("'get_gpt_model' only support dp, mp and pp.") + + tokens = paddle.static.data( + name="tokens", shape=[batch_size, sequence_len], dtype='int64') + position_ids = paddle.static.data( + name="position_ids", + shape=[batch_size, 
sequence_len], + dtype='int64') + attention_mask = paddle.static.data( + name="attention_mask", + shape=[batch_size, 1, sequence_len, sequence_len], + dtype='float32') + labels = paddle.static.data( + name="labels", shape=[batch_size, sequence_len], dtype='int64') + loss_mask = paddle.static.data( + name="loss_mask", shape=[batch_size, sequence_len], dtype='float32') + data_holder = [tokens, position_ids, attention_mask, labels, loss_mask] + + if modeling._global_parallel_strategy == "dp": + auto.shard_tensor( + tokens, + dist_attr={ + "process_mesh": modeling._global_process_mesh, + "dims_mapping": [0, -1] + }) + elif modeling._global_parallel_strategy == "pp": + auto.shard_tensor( + tokens, + dist_attr={ + "process_mesh": modeling.PP_MESH_LIST[0], + "dims_mapping": [-1, -1] + }) + auto.shard_tensor( + attention_mask, + dist_attr={ + "process_mesh": modeling.PP_MESH_LIST[0], + "dims_mapping": [-1, -1, -1, -1] + }) + + gpt = GPTModel( + vocab_size=1000, + hidden_size=64, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=256, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=1024, + type_vocab_size=1, + initializer_range=0.02, + pad_token_id=0, + eos_token_id=7, + bos_token_id=0, + eol_token_id=3) + + model = GPTForPretraining( + gpt, vocab_size=1000, hidden_size=64, initializer_range=0.02) + preds = model(tokens, position_ids, attention_mask) + criterion = GPTPretrainingCriterion() + loss = criterion(preds, labels, loss_mask) + clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=clip) + optimizer = fleet.distributed_optimizer(optimizer) + startup_program = paddle.static.default_startup_program() + _, _, dist_startup_prog, dist_main_prog = optimizer.minimize( + loss, startup_program) + + def gen_data(): + np.random.seed(2021) + for _ in range(10): + tokens = [] + position_ids = [] + attention_mask = [] + labels = [] + loss_mask = [] + for _ in range(batch_size): + tokens.append( + np.random.randint( + vocab_size, size=sequence_len)) + position_ids.append(np.arange(sequence_len)) + attention_mask.append([np.tril(np.ones(sequence_len))]) + labels.append( + np.random.randint( + vocab_size, size=sequence_len)) + loss_mask.append(np.ones(sequence_len)) + + yield tokens, position_ids, attention_mask, labels, loss_mask + + return dist_main_prog, dist_startup_prog, data_holder, [loss], gen_data diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py new file mode 100644 index 0000000000000..51e87260609df --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
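# TestShardingPass reuses AutoPallelPassTestBase.get_gpt_model with the 'dp' strategy and turns on the sharding pass (sharding_degree=2, stage=2); the base class verifies that the results match a run without the pass.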
+ +import sys +import random +import numpy as np + +import unittest +import paddle +import paddle.nn as nn +import paddle.distributed.fleet as fleet +import paddle.distributed.auto_parallel as auto +from paddle.distributed.passes import new_pass, PassManager +from auto_parallel_pass_test_base import AutoPallelPassTestBase +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion + + +class TestShardingPass(AutoPallelPassTestBase): + def init(self): + if paddle.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': 1}) + self.rtol = 1e-5 + self.atol = 1e-8 + + rank = paddle.distributed.get_rank() + paddle.seed(rank + 2021) + random.seed(rank + 2021) + np.random.seed(rank + 2021) + + def apply_passes(self): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + dist_strategy.sharding = True + dist_strategy.sharding_configs = { + "sharding_degree": 2, + "stage": 2, + } + fleet.init(is_collective=True, strategy=dist_strategy) + + def apply_no_passes(self): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.pipeline = False + dist_strategy.recompute = False + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + + def test_bs_8(self): + self.check_main( + gpus=[0, 1], batch_size=8, sequence_len=512, vocab_size=1000) + + def get_model(self, place, batch_size, sequence_len, vocab_size): + return self.get_gpt_model('dp', place, batch_size, sequence_len, + vocab_size) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py new file mode 100644 index 0000000000000..acb67e8a20c8c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py @@ -0,0 +1,214 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
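# The gradient-merge test compares a plain run (batch_size=32, max_step=2) against a run with auto_parallel_gradient_merge_pass applied (batch_size=8, k_steps=4, max_step=8), averaging each group of 4 merged-step losses before asserting they are close to the corresponding plain-run losses.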
+ +import time +import unittest +import random +import numpy as np +import os +import shutil +import logging + +import paddle +import paddle.nn as nn +import paddle.utils as utils +import paddle.static as static +import paddle.nn.functional as F +import paddle.distributed.auto_parallel as auto +from paddle.fluid.initializer import NumpyArrayInitializer + +from paddle.distributed.passes import new_pass, PassManager, PassContext +import paddle.distributed.fleet as fleet +from dist_pass_test_base import DistPassTestBase + +logging.getLogger().setLevel(logging.INFO) +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None + +#np.set_printoptions(suppress=True) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=128, + intermediate_size=4 * 128, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + np.random.seed(2021) + arr0 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) + arr1 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model)) + weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0)) + weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1)) + bias_attr = None + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr) + self.linear2 = nn.Linear( + d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr) + self.linear3 = nn.Linear( + dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr) + self.linear4 = nn.Linear( + d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr) + self.linear5 = nn.Linear( + dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr) + self.norm0 = nn.LayerNorm(d_model, epsilon=1e-5) + self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) + self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + out = self.norm0(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + out = self.norm1(out) + out = self.linear2(out) + out = F.gelu(out, approximate=True) + out = self.linear3(out) + + out = self.norm2(out) + out = self.linear4(out) + out = F.gelu(out, approximate=True) + out = self.linear5(out) + return out + + +def mlp_forward(input, label, hidden_size): + if _global_parallel_strategy == "dp": + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [0, -1] + }) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + return loss + + +class TestGradientMergePass(DistPassTestBase): + def init(self): + self._params_grads = None + self._config = {"k_steps": 4, "avg": True} + + def apply_passes(self, main_prog, startup_prog): + self._config["params_grads"] = self._params_grads + pass_context = PassContext() + auto_parallel_gradient_merge_pass = new_pass( + "auto_parallel_gradient_merge_pass", self._config) + auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog], + pass_context) + + def test_result(self): + no_pass_rets = self._distributed_launch( + model=None, + apply_pass=False, + gpus=[0], + gradient_merge=False, + batch_size=32, + max_step=2) + pass_rets = self._distributed_launch( + model=None, + apply_pass=True, + gpus=[0], + gradient_merge=True, + batch_size=8, + 
max_step=8) + + # avg loss for gradient_merge pass + avg_loss = 0 + pass_avg_ret_list = [] + for i, pass_ret in enumerate(pass_rets[0]): + if (i + 1) % 4 == 0: + avg_loss += pass_ret[0] + pass_avg_ret_list.append([avg_loss / 4]) + avg_loss = 0 + else: + avg_loss += pass_ret[0] + + for no_pass_ret, pass_ret in zip(no_pass_rets[0], pass_avg_ret_list): + print(f"no_pass_ret={no_pass_ret}, pass_ret={pass_ret}") + self.assertTrue( + np.isclose( + no_pass_ret, + pass_ret, + rtol=self.rtol, + atol=self.atol, + equal_nan=self.equal_nan)) + + def get_model(self, place, gradient_merge, batch_size, max_step): + paddle.seed(2021) + random.seed(2021) + np.random.seed(2021) + + hidden_size = 128 + + global _global_parallel_strategy + global _global_process_mesh + world_size = paddle.distributed.get_world_size() + if world_size == 1: + _global_parallel_strategy = "dp" + _global_process_mesh = auto.ProcessMesh([0]) + elif world_size == 2: + _global_parallel_strategy = "dp" + _global_process_mesh = auto.ProcessMesh([0, 1]) + + train_program = static.Program() + startup_program = static.Program() + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + #if gradient_merge: + # dist_strategy.gradient_merge = True + # dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} + fleet.init(is_collective=True, strategy=dist_strategy) + + with static.program_guard(train_program, startup_program), \ + utils.unique_name.guard(): + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + input.stop_gradient = False + loss = mlp_forward(input, label, hidden_size) + + optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01) + #optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer) + _, self._params_grads, dist_startup_prog, dist_main_prog = optimizer.minimize( + loss, startup_program) + + input_data = np.random.random(size=(128, hidden_size)).astype('float32') + label_data = np.random.random(size=(128, 1)).astype('float32') + + def reader(): + for i in range(max_step): + x_data = input_data[i * batch_size:(i + 1) * batch_size, :] + y_data = label_data[i * batch_size:(i + 1) * batch_size, :] + yield x_data, y_data + + return dist_main_prog, dist_startup_prog, [input, label], [loss], reader + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt new file mode 100644 index 0000000000000..f71e04c09aa38 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/distribution/config.py b/python/paddle/fluid/tests/unittests/distribution/config.py new file mode 100644 index 0000000000000..809dfb2b56d66 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/config.py @@ -0,0 +1,99 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import sys + +import numpy as np +import paddle + +DEVICES = [paddle.CPUPlace()] +if paddle.is_compiled_with_cuda(): + DEVICES.append(paddle.CUDAPlace(0)) + +DEFAULT_DTYPE = 'float64' + +TEST_CASE_NAME = 'suffix' +# All test case will use float64 for compare percision, refs: +# https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64 +RTOL = { + 'float32': 1e-03, + 'complex64': 1e-3, + 'float64': 1e-5, + 'complex128': 1e-5 +} +ATOL = {'float32': 0.0, 'complex64': 0, 'float64': 0.0, 'complex128': 0} + + +def xrand(shape=(10, 10, 10), dtype=DEFAULT_DTYPE, min=1.0, max=10.0): + return ((np.random.rand(*shape).astype(dtype)) * (max - min) + min) + + +def place(devices, key='place'): + def decorate(cls): + module = sys.modules[cls.__module__].__dict__ + raw_classes = { + k: v + for k, v in module.items() if k.startswith(cls.__name__) + } + + for raw_name, raw_cls in raw_classes.items(): + for d in devices: + test_cls = dict(raw_cls.__dict__) + test_cls.update({key: d}) + new_name = raw_name + '.' + d.__class__.__name__ + module[new_name] = type(new_name, (raw_cls, ), test_cls) + del module[raw_name] + return cls + + return decorate + + +def parameterize(fields, values=None): + + fields = [fields] if isinstance(fields, str) else fields + params = [dict(zip(fields, vals)) for vals in values] + + def decorate(cls): + test_cls_module = sys.modules[cls.__module__].__dict__ + for k, v in enumerate(params): + test_cls = dict(cls.__dict__) + test_cls.update(v) + name = cls.__name__ + str(k) + name = name + '.' + v.get('suffix') if v.get('suffix') else name + + test_cls_module[name] = type(name, (cls, ), test_cls) + + for m in list(cls.__dict__): + if m.startswith("test"): + delattr(cls, m) + return cls + + return decorate + + +@contextlib.contextmanager +def stgraph(func, *args): + """static graph exec context""" + paddle.enable_static() + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + input = paddle.static.data('input', x.shape, dtype=x.dtype) + output = func(input, n, axes, norm) + + exe = paddle.static.Executor(place) + exe.run(sp) + [output] = exe.run(mp, feed={'input': x}, fetch_list=[output]) + yield output + paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/distribution/mock_data.py b/python/paddle/fluid/tests/unittests/distribution/mock_data.py new file mode 100644 index 0000000000000..a5a6b5542cd90 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/mock_data.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + + +class Exponential(paddle.distribution.ExponentialFamily): + """mock exponential distribution, which support computing entropy and + kl use bregman divergence + """ + _mean_carrier_measure = 0 + + def __init__(self, rate): + self._rate = rate + super(Exponential, self).__init__(batch_shape=rate.shape) + + @property + def rate(self): + return self._rate + + def entropy(self): + return 1.0 - paddle.log(self._rate) + + @property + def _natural_parameters(self): + return (-self._rate, ) + + def _log_normalizer(self, x): + return -paddle.log(-x) + + +class DummyExpFamily(paddle.distribution.ExponentialFamily): + """dummy class extend from exponential family + """ + + def __init__(self, *args): + pass + + def entropy(self): + return 1.0 + + @property + def _natural_parameters(self): + return (1.0, ) + + def _log_normalizer(self, x): + return -paddle.log(-x) + + +@paddle.distribution.register_kl(Exponential, Exponential) +def _kl_exponential_exponential(p, q): + rate_ratio = q.rate / p.rate + t1 = -rate_ratio.log() + return t1 + rate_ratio - 1 diff --git a/python/paddle/fluid/tests/unittests/distribution/test_dirichlet_op.py b/python/paddle/fluid/tests/unittests/distribution/test_dirichlet_op.py new file mode 100644 index 0000000000000..3e7662b573e0d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_dirichlet_op.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import re +import sys +import unittest + +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid.dygraph as dg +import paddle.static as static +import scipy.stats +from numpy.random import random as rand +sys.path.append("../") +from op_test import OpTest +from paddle.fluid import Program, program_guard + +paddle.enable_static() + + +class TestDirichletOp(OpTest): + # Because dirichlet random sample have not gradient, we skip gradient check. + no_need_check_grad = True + + def setUp(self): + self.op_type = "dirichlet" + self.alpha = np.array((1., 2.)) + self.sample_shape = (100000, 2) + + self.inputs = {'Alpha': np.broadcast_to(self.alpha, self.sample_shape)} + self.attrs = {} + self.outputs = {'Out': np.zeros(self.sample_shape)} + + def test_check_output(self): + self.check_output_customized(self._hypothesis_testing) + + def _hypothesis_testing(self, outs): + self.assertEqual(outs[0].shape, self.sample_shape) + self.assertTrue(np.all(outs[0] > 0.0)) + self.assertLess( + scipy.stats.kstest( + outs[0][:, 0], + # scipy dirichlet have not cdf, use beta to replace it. 
+ scipy.stats.beta( + a=self.alpha[0], b=self.alpha[1]).cdf)[0], + 0.01) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution.py new file mode 100644 index 0000000000000..42b658cdce4d2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution.py @@ -0,0 +1,166 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import unittest + +import numpy as np +import paddle +from paddle import fluid +from paddle.distribution import * +from paddle.fluid import layers + +import config + +paddle.enable_static() + + +class DistributionNumpy(): + def sample(self): + raise NotImplementedError + + def entropy(self): + raise NotImplementedError + + def kl_divergence(self, other): + raise NotImplementedError + + def log_prob(self, value): + raise NotImplementedError + + def probs(self, value): + raise NotImplementedError + + +class DistributionTestName(unittest.TestCase): + def get_prefix(self, string): + return (string.split('.')[0]) + + def test_normal_name(self): + name = 'test_normal' + normal1 = Normal(0.0, 1.0, name=name) + self.assertEqual(normal1.name, name) + + normal2 = Normal(0.0, 1.0) + self.assertEqual(normal2.name, 'Normal') + + paddle.enable_static() + + sample = normal1.sample([2]) + self.assertEqual(self.get_prefix(sample.name), name + '_sample') + + entropy = normal1.entropy() + self.assertEqual(self.get_prefix(entropy.name), name + '_entropy') + + value_npdata = np.array([0.8], dtype="float32") + value_tensor = layers.create_tensor(dtype="float32") + layers.assign(value_npdata, value_tensor) + + lp = normal1.log_prob(value_tensor) + self.assertEqual(self.get_prefix(lp.name), name + '_log_prob') + + p = normal1.probs(value_tensor) + self.assertEqual(self.get_prefix(p.name), name + '_probs') + + kl = normal1.kl_divergence(normal2) + self.assertEqual(self.get_prefix(kl.name), name + '_kl_divergence') + + def test_uniform_name(self): + name = 'test_uniform' + uniform1 = Uniform(0.0, 1.0, name=name) + self.assertEqual(uniform1.name, name) + + uniform2 = Uniform(0.0, 1.0) + self.assertEqual(uniform2.name, 'Uniform') + + paddle.enable_static() + + sample = uniform1.sample([2]) + self.assertEqual(self.get_prefix(sample.name), name + '_sample') + + entropy = uniform1.entropy() + self.assertEqual(self.get_prefix(entropy.name), name + '_entropy') + + value_npdata = np.array([0.8], dtype="float32") + value_tensor = layers.create_tensor(dtype="float32") + layers.assign(value_npdata, value_tensor) + + lp = uniform1.log_prob(value_tensor) + self.assertEqual(self.get_prefix(lp.name), name + '_log_prob') + + p = uniform1.probs(value_tensor) + self.assertEqual(self.get_prefix(p.name), name + '_probs') + + def test_categorical_name(self): + name = 'test_categorical' + categorical1 = Categorical([0.4, 0.6], name=name) + self.assertEqual(categorical1.name, name) + + categorical2 = Categorical([0.5, 0.5]) + 
self.assertEqual(categorical2.name, 'Categorical') + + paddle.enable_static() + + sample = categorical1.sample([2]) + self.assertEqual(self.get_prefix(sample.name), name + '_sample') + + entropy = categorical1.entropy() + self.assertEqual(self.get_prefix(entropy.name), name + '_entropy') + + kl = categorical1.kl_divergence(categorical2) + self.assertEqual(self.get_prefix(kl.name), name + '_kl_divergence') + + value_npdata = np.array([0], dtype="int64") + value_tensor = layers.create_tensor(dtype="int64") + layers.assign(value_npdata, value_tensor) + + p = categorical1.probs(value_tensor) + self.assertEqual(self.get_prefix(p.name), name + '_probs') + + lp = categorical1.log_prob(value_tensor) + self.assertEqual(self.get_prefix(lp.name), name + '_log_prob') + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'batch_shape', 'event_shape'), + [('test-tuple', (10, 20), + (10, 20)), ('test-list', [100, 100], [100, 200, 300]), + ('test-null-eventshape', (100, 100), ())]) +class TestDistributionShape(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.dist = paddle.distribution.Distribution( + batch_shape=self.batch_shape, event_shape=self.event_shape) + + def tearDown(self): + paddle.enable_static() + + def test_batch_shape(self): + self.assertTrue(isinstance(self.dist.batch_shape, tuple)) + self.assertTrue(self.dist.batch_shape == tuple(self.batch_shape)) + + def test_event_shape(self): + self.assertTrue(isinstance(self.dist.event_shape, tuple)) + self.assertTrue(self.dist.event_shape == tuple(self.event_shape)) + + def test_prob(self): + with self.assertRaises(NotImplementedError): + self.dist.prob(paddle.to_tensor(config.xrand())) + + def test_extend_shape(self): + shapes = [(34, 20), (56, ), ()] + for shape in shapes: + self.assertTrue( + self.dist._extend_shape(shape), + shape + self.dist.batch_shape + self.dist.event_shape) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta.py new file mode 100644 index 0000000000000..1e5267405e8b8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
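+# Illustrative sketch only: the dygraph tests in this file check
+# paddle.distribution.Beta against scipy.stats.beta. The helper below is never
+# called by the suite; the 1e-5 tolerance is a placeholder, the real tests take
+# RTOL/ATOL from config based on the parameter dtype.
+def _beta_usage_sketch():
+    import numpy as np
+    import paddle
+    import scipy.stats
+
+    alpha, beta = 1.0, 2.0
+    dist = paddle.distribution.Beta(alpha, beta)
+    # moments and entropy are expected to agree with scipy under dygraph mode
+    np.testing.assert_allclose(dist.mean, scipy.stats.beta.mean(alpha, beta), rtol=1e-5)
+    np.testing.assert_allclose(dist.variance, scipy.stats.beta.var(alpha, beta), rtol=1e-5)
+    np.testing.assert_allclose(dist.entropy(), scipy.stats.beta.entropy(alpha, beta), rtol=1e-5)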
+import numbers +import unittest + +import numpy as np +import paddle +import scipy.stats + +from config import (ATOL, DEVICES, RTOL, TEST_CASE_NAME, parameterize, place, + xrand) + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'alpha', 'beta'), + [('test-scale', 1.0, 2.0), ('test-tensor', xrand(), xrand()), + ('test-broadcast', xrand((2, 1)), xrand((2, 5)))]) +class TestBeta(unittest.TestCase): + def setUp(self): + # scale no need convert to tensor for scale input unittest + alpha, beta = self.alpha, self.beta + if not isinstance(self.alpha, numbers.Real): + alpha = paddle.to_tensor(self.alpha) + if not isinstance(self.beta, numbers.Real): + beta = paddle.to_tensor(self.beta) + + self._paddle_beta = paddle.distribution.Beta(alpha, beta) + + def test_mean(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_beta.mean, + scipy.stats.beta.mean(self.alpha, self.beta), + rtol=RTOL.get(str(self._paddle_beta.alpha.numpy().dtype)), + atol=ATOL.get(str(self._paddle_beta.alpha.numpy().dtype))) + + def test_variance(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_beta.variance, + scipy.stats.beta.var(self.alpha, self.beta), + rtol=RTOL.get(str(self._paddle_beta.alpha.numpy().dtype)), + atol=ATOL.get(str(self._paddle_beta.alpha.numpy().dtype))) + + def test_prob(self): + value = [np.random.rand(*self._paddle_beta.alpha.shape)] + + for v in value: + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_beta.prob(paddle.to_tensor(v)), + scipy.stats.beta.pdf(v, self.alpha, self.beta), + rtol=RTOL.get(str(self._paddle_beta.alpha.numpy().dtype)), + atol=ATOL.get(str(self._paddle_beta.alpha.numpy().dtype))) + + def test_log_prob(self): + value = [np.random.rand(*self._paddle_beta.alpha.shape)] + + for v in value: + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_beta.log_prob(paddle.to_tensor(v)), + scipy.stats.beta.logpdf(v, self.alpha, self.beta), + rtol=RTOL.get(str(self._paddle_beta.alpha.numpy().dtype)), + atol=ATOL.get(str(self._paddle_beta.alpha.numpy().dtype))) + + def test_entropy(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_beta.entropy(), + scipy.stats.beta.entropy(self.alpha, self.beta), + rtol=RTOL.get(str(self._paddle_beta.alpha.numpy().dtype)), + atol=ATOL.get(str(self._paddle_beta.alpha.numpy().dtype))) + + def test_sample_shape(self): + cases = [ + { + 'input': [], + 'expect': [] + paddle.squeeze(self._paddle_beta.alpha).shape + }, + { + 'input': [2, 3], + 'expect': [2, 3] + paddle.squeeze(self._paddle_beta.alpha).shape + }, + ] + for case in cases: + self.assertTrue( + self._paddle_beta.sample(case.get('input')).shape == + case.get('expect')) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py new file mode 100644 index 0000000000000..b8d72336807a4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_beta_static.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numbers +import unittest + +import numpy as np +import paddle +import scipy.stats + +from config import (ATOL, DEVICES, RTOL, TEST_CASE_NAME, parameterize, place, + xrand) + +paddle.enable_static() + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'alpha', 'beta'), [('test-tensor', xrand( + (10, 10)), xrand((10, 10))), ('test-broadcast', xrand((2, 1)), xrand( + (2, 5))), ('test-larger-data', xrand((10, 20)), xrand((10, 20)))]) +class TestBeta(unittest.TestCase): + def setUp(self): + self.program = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + with paddle.static.program_guard(self.program): + # scale no need convert to tensor for scale input unittest + alpha = paddle.static.data('alpha', self.alpha.shape, + self.alpha.dtype) + beta = paddle.static.data('beta', self.beta.shape, self.beta.dtype) + self._paddle_beta = paddle.distribution.Beta(alpha, beta) + self.feeds = {'alpha': self.alpha, 'beta': self.beta} + + def test_mean(self): + with paddle.static.program_guard(self.program): + [mean] = self.executor.run(self.program, + feed=self.feeds, + fetch_list=[self._paddle_beta.mean]) + np.testing.assert_allclose( + mean, + scipy.stats.beta.mean(self.alpha, self.beta), + rtol=RTOL.get(str(self.alpha.dtype)), + atol=ATOL.get(str(self.alpha.dtype))) + + def test_variance(self): + with paddle.static.program_guard(self.program): + [variance] = self.executor.run( + self.program, + feed=self.feeds, + fetch_list=[self._paddle_beta.variance]) + np.testing.assert_allclose( + variance, + scipy.stats.beta.var(self.alpha, self.beta), + rtol=RTOL.get(str(self.alpha.dtype)), + atol=ATOL.get(str(self.alpha.dtype))) + + def test_prob(self): + + with paddle.static.program_guard(self.program): + + value = paddle.static.data('value', self._paddle_beta.alpha.shape, + self._paddle_beta.alpha.dtype) + prob = self._paddle_beta.prob(value) + + random_number = np.random.rand(*self._paddle_beta.alpha.shape) + feeds = dict(self.feeds, value=random_number) + [prob] = self.executor.run(self.program, + feed=feeds, + fetch_list=[prob]) + np.testing.assert_allclose( + prob, + scipy.stats.beta.pdf(random_number, self.alpha, self.beta), + rtol=RTOL.get(str(self.alpha.dtype)), + atol=ATOL.get(str(self.alpha.dtype))) + + def test_log_prob(self): + with paddle.static.program_guard(self.program): + value = paddle.static.data('value', self._paddle_beta.alpha.shape, + self._paddle_beta.alpha.dtype) + prob = self._paddle_beta.log_prob(value) + random_number = np.random.rand(*self._paddle_beta.alpha.shape) + feeds = dict(self.feeds, value=random_number) + [prob] = self.executor.run(self.program, + feed=feeds, + fetch_list=[prob]) + np.testing.assert_allclose( + prob, + scipy.stats.beta.logpdf(random_number, self.alpha, self.beta), + rtol=RTOL.get(str(self.alpha.dtype)), + atol=ATOL.get(str(self.alpha.dtype))) + + def test_entropy(self): + with paddle.static.program_guard(self.program): + [entropy] = self.executor.run( + self.program, + feed=self.feeds, + fetch_list=[self._paddle_beta.entropy()]) + np.testing.assert_allclose( + entropy, + scipy.stats.beta.entropy(self.alpha, self.beta), 
+ rtol=RTOL.get(str(self.alpha.dtype)), + atol=ATOL.get(str(self.alpha.dtype))) + + def test_sample(self): + with paddle.static.program_guard(self.program): + [data] = self.executor.run(self.program, + feed=self.feeds, + fetch_list=self._paddle_beta.sample()) + self.assertTrue(data.shape, + np.broadcast_arrays(self.alpha, self.beta)[0].shape) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py new file mode 100644 index 0000000000000..5f3f5e2a9302e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_categorical.py @@ -0,0 +1,441 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import unittest + +import numpy as np +import paddle +from paddle import fluid +from paddle.distribution import * +from paddle.fluid import layers + +from test_distribution import DistributionNumpy + + +class CategoricalNumpy(DistributionNumpy): + def __init__(self, logits): + self.logits = np.array(logits).astype('float32') + + def entropy(self): + logits = self.logits - np.max(self.logits, axis=-1, keepdims=True) + e_logits = np.exp(logits) + z = np.sum(e_logits, axis=-1, keepdims=True) + prob = e_logits / z + return -1. * np.sum(prob * (logits - np.log(z)), axis=-1, keepdims=True) + + def kl_divergence(self, other): + logits = self.logits - np.max(self.logits, axis=-1, keepdims=True) + other_logits = other.logits - np.max( + other.logits, axis=-1, keepdims=True) + e_logits = np.exp(logits) + other_e_logits = np.exp(other_logits) + z = np.sum(e_logits, axis=-1, keepdims=True) + other_z = np.sum(other_e_logits, axis=-1, keepdims=True) + prob = e_logits / z + return np.sum(prob * + (logits - np.log(z) - other_logits + np.log(other_z)), + axis=-1, + keepdims=True) + + +class CategoricalTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=3, dims=5): + self.use_gpu = use_gpu + if not use_gpu: + self.place = fluid.CPUPlace() + self.gpu_id = -1 + else: + self.place = fluid.CUDAPlace(0) + self.gpu_id = 0 + + self.batch_size = batch_size + self.dims = dims + self.init_numpy_data(batch_size, dims) + + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) + + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # input logtis is 2-D Tensor + # value used in probs and log_prob method is 1-D Tensor + self.logits_np = np.random.rand(batch_size, dims).astype('float32') + self.other_logits_np = np.random.rand(batch_size, + dims).astype('float32') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [batch_size, dims] + # dist_shape = logits_shape[:-1], it represents the number of + # different distributions. 
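+        # e.g. logits of shape [3, 5] describe 3 categorical distributions over
+        # 5 categories, so dist_shape is [3]; 3-D logits of shape [3, 2, 5]
+        # (see CategoricalTest7 below) give dist_shape [3, 2].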
+ self.dist_shape = [batch_size] + # sample shape represents the number of samples + self.sample_shape = [2, 4] + # value used in probs and log_prob method + # If value is 1-D and logits is 2-D or higher dimension, value will be + # broadcasted to have the same number of distributions with logits. + # If value is 2-D or higher dimentsion, it should have the same number + # of distributions with logtis. ``value[:-1] = logits[:-1] + self.value_shape = [3] + + def init_dynamic_data(self, batch_size, dims): + self.logits = paddle.to_tensor(self.logits_np) + self.other_logits = paddle.to_tensor(self.other_logits_np) + self.value = paddle.to_tensor(self.value_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = fluid.data( + name='logits', shape=self.logits_shape, dtype='float32') + self.other_logits_static = fluid.data( + name='other_logits', shape=self.logits_shape, dtype='float32') + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + def get_numpy_selected_probs(self, probability): + np_probs = np.zeros(self.dist_shape + self.value_shape) + for i in range(self.batch_size): + for j in range(3): + np_probs[i][j] = probability[i][self.value_np[j]] + return np_probs + + def compare_with_numpy(self, fetch_list, tolerance=1e-6): + sample, entropy, kl, probs, log_prob = fetch_list + log_tolerance = 1e-4 + + np.testing.assert_equal(sample.shape, + self.sample_shape + self.dist_shape) + + np_categorical = CategoricalNumpy(self.logits_np) + np_other_categorical = CategoricalNumpy(self.other_logits_np) + np_entropy = np_categorical.entropy() + np_kl = np_categorical.kl_divergence(np_other_categorical) + + np.testing.assert_allclose( + entropy, np_entropy, rtol=log_tolerance, atol=log_tolerance) + np.testing.assert_allclose( + kl, np_kl, rtol=log_tolerance, atol=log_tolerance) + + sum_dist = np.sum(self.logits_np, axis=-1, keepdims=True) + probability = self.logits_np / sum_dist + np_probs = self.get_numpy_selected_probs(probability) + np_log_prob = np.log(np_probs) + + np.testing.assert_allclose( + probs, np_probs, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + log_prob, np_log_prob, rtol=tolerance, atol=tolerance) + + def test_categorical_distribution_dygraph(self, tolerance=1e-6): + paddle.disable_static(self.place) + categorical = Categorical(self.logits) + other_categorical = Categorical(self.other_logits) + + sample = categorical.sample(self.sample_shape).numpy() + entropy = categorical.entropy().numpy() + kl = categorical.kl_divergence(other_categorical).numpy() + probs = categorical.probs(self.value).numpy() + log_prob = categorical.log_prob(self.value).numpy() + + fetch_list = [sample, entropy, kl, probs, log_prob] + self.compare_with_numpy(fetch_list) + + def test_categorical_distribution_static(self, tolerance=1e-6): + paddle.enable_static() + with fluid.program_guard(self.test_program): + categorical = Categorical(self.logits_static) + other_categorical = Categorical(self.other_logits_static) + + sample = categorical.sample(self.sample_shape) + entropy = categorical.entropy() + kl = categorical.kl_divergence(other_categorical) + probs = categorical.probs(self.value_static) + log_prob = categorical.log_prob(self.value_static) + + fetch_list = [sample, entropy, kl, probs, log_prob] + + feed_vars = { + 'logits': self.logits_np, + 'other_logits': self.other_logits_np, + 'value': self.value_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = 
self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class CategoricalTest2(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 2-D Tensor with dtype Float64 + # value used in probs and log_prob method is 1-D Tensor + self.logits_np = np.random.rand(batch_size, dims).astype('float64') + self.other_logits_np = np.random.rand(batch_size, + dims).astype('float64') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [batch_size, dims] + self.dist_shape = [batch_size] + self.sample_shape = [2, 4] + self.value_shape = [3] + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = fluid.data( + name='logits', shape=self.logits_shape, dtype='float64') + self.other_logits_static = fluid.data( + name='other_logits', shape=self.logits_shape, dtype='float64') + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + +class CategoricalTest3(CategoricalTest): + def init_dynamic_data(self, batch_size, dims): + # input logtis is 2-D numpy.ndarray with dtype Float32 + # value used in probs and log_prob method is 1-D Tensor + self.logits = self.logits_np + self.other_logits = self.other_logits_np + self.value = paddle.to_tensor(self.value_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = self.logits_np + self.other_logits_static = self.other_logits_np + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + +class CategoricalTest4(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 2-D numpy.ndarray with dtype Float64 + # value used in probs and log_prob method is 1-D Tensor + self.logits_np = np.random.rand(batch_size, dims).astype('float64') + self.other_logits_np = np.random.rand(batch_size, + dims).astype('float64') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [batch_size, dims] + self.dist_shape = [batch_size] + self.sample_shape = [2, 4] + self.value_shape = [3] + + def init_dynamic_data(self, batch_size, dims): + self.logits = self.logits_np + self.other_logits = self.other_logits_np + self.value = paddle.to_tensor(self.value_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = self.logits_np + self.other_logits_static = self.other_logits_np + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + +# test shape of logits and value used in probs and log_prob method +class CategoricalTest5(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 1-D Tensor + # value used in probs and log_prob method is 1-D Tensor + self.logits_np = np.random.rand(dims).astype('float32') + self.other_logits_np = np.random.rand(dims).astype('float32') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [dims] + self.dist_shape = [] + self.sample_shape = [2, 4] + self.value_shape = [3] + + def get_numpy_selected_probs(self, probability): + np_probs = np.zeros(self.value_shape) + for i in range(3): + np_probs[i] = probability[self.value_np[i]] + return np_probs + + +class CategoricalTest6(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 2-D Tensor + # value used in probs and log_prob method has the same number of batches with 
input + self.logits_np = np.random.rand(3, 5).astype('float32') + self.other_logits_np = np.random.rand(3, 5).astype('float32') + self.value_np = np.array([[2, 1], [0, 3], [2, 3]]).astype('int64') + + self.logits_shape = [3, 5] + self.dist_shape = [3] + self.sample_shape = [2, 4] + self.value_shape = [3, 2] + + def get_numpy_selected_probs(self, probability): + np_probs = np.zeros(self.value_shape) + for i in range(3): + for j in range(2): + np_probs[i][j] = probability[i][self.value_np[i][j]] + return np_probs + + +class CategoricalTest7(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 3-D Tensor + # value used in probs and log_prob method has the same number of distribuions with input + self.logits_np = np.random.rand(3, 2, 5).astype('float32') + self.other_logits_np = np.random.rand(3, 2, 5).astype('float32') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [3, 2, 5] + self.dist_shape = [3, 2] + self.sample_shape = [2, 4] + self.value_shape = [3] + + def get_numpy_selected_probs(self, probability): + np_probs = np.zeros(self.dist_shape + self.value_shape) + for i in range(3): + for j in range(2): + for k in range(3): + np_probs[i][j][k] = probability[i][j][self.value_np[k]] + return np_probs + + +class CategoricalTest8(CategoricalTest): + def init_dynamic_data(self, batch_size, dims): + # input logtis is 2-D list + # value used in probs and log_prob method is 1-D Tensor + self.logits = self.logits_np.tolist() + self.other_logits = self.other_logits_np.tolist() + self.value = paddle.to_tensor(self.value_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = self.logits_np.tolist() + self.other_logits_static = self.other_logits_np.tolist() + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + +class CategoricalTest9(CategoricalTest): + def init_dynamic_data(self, batch_size, dims): + # input logtis is 2-D tuple + # value used in probs and log_prob method is 1-D Tensor + self.logits = tuple(self.logits_np.tolist()) + self.other_logits = tuple(self.other_logits_np.tolist()) + self.value = paddle.to_tensor(self.value_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = tuple(self.logits_np.tolist()) + self.other_logits_static = tuple(self.other_logits_np.tolist()) + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + +class DistributionTestError(unittest.TestCase): + def test_distribution_error(self): + distribution = Distribution() + + self.assertRaises(NotImplementedError, distribution.sample) + self.assertRaises(NotImplementedError, distribution.entropy) + + normal = Normal(0.0, 1.0) + self.assertRaises(NotImplementedError, distribution.kl_divergence, + normal) + + value_npdata = np.array([0.8], dtype="float32") + value_tensor = layers.create_tensor(dtype="float32") + self.assertRaises(NotImplementedError, distribution.log_prob, + value_tensor) + self.assertRaises(NotImplementedError, distribution.probs, value_tensor) + + def test_normal_error(self): + paddle.enable_static() + normal = Normal(0.0, 1.0) + + value = [1.0, 2.0] + # type of value must be variable + self.assertRaises(TypeError, normal.log_prob, value) + + value = [1.0, 2.0] + # type of value must be variable + self.assertRaises(TypeError, normal.probs, value) + + shape = 1.0 + # type of shape must be list + self.assertRaises(TypeError, 
normal.sample, shape) + + seed = 1.0 + # type of seed must be int + self.assertRaises(TypeError, normal.sample, [2, 3], seed) + + normal_other = Uniform(1.0, 2.0) + # type of other must be an instance of Normal + self.assertRaises(TypeError, normal.kl_divergence, normal_other) + + def test_uniform_error(self): + paddle.enable_static() + uniform = Uniform(0.0, 1.0) + + value = [1.0, 2.0] + # type of value must be variable + self.assertRaises(TypeError, uniform.log_prob, value) + + value = [1.0, 2.0] + # type of value must be variable + self.assertRaises(TypeError, uniform.probs, value) + + shape = 1.0 + # type of shape must be list + self.assertRaises(TypeError, uniform.sample, shape) + + seed = 1.0 + # type of seed must be int + self.assertRaises(TypeError, uniform.sample, [2, 3], seed) + + def test_categorical_error(self): + paddle.enable_static() + + categorical = Categorical([0.4, 0.6]) + + value = [1, 0] + # type of value must be variable + self.assertRaises(AttributeError, categorical.log_prob, value) + + value = [1, 0] + # type of value must be variable + self.assertRaises(AttributeError, categorical.probs, value) + + shape = 1.0 + # type of shape must be list + self.assertRaises(TypeError, categorical.sample, shape) + + categorical_other = Uniform(1.0, 2.0) + # type of other must be an instance of Categorical + self.assertRaises(TypeError, categorical.kl_divergence, + categorical_other) + + def test_shape_not_match_error(): + # shape of value must match shape of logits + # value_shape[:-1] == logits_shape[:-1] + paddle.disable_static() + logits = paddle.rand([3, 5]) + cat = Categorical(logits) + value = paddle.to_tensor([[2, 1, 3], [3, 2, 1]], dtype='int64') + cat.log_prob(value) + + self.assertRaises(ValueError, test_shape_not_match_error) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py new file mode 100644 index 0000000000000..8baddfa2e9be1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
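+# Illustrative sketch only: the tests below compare paddle.distribution.Dirichlet
+# with scipy.stats.dirichlet in dygraph mode. The helper is never called; the
+# 1e-6 tolerance is a placeholder for the dtype-based RTOL/ATOL from config.
+def _dirichlet_usage_sketch():
+    import numpy as np
+    import paddle
+    import scipy.stats
+
+    conc = np.array([1.0, 2.0, 3.0])
+    dist = paddle.distribution.Dirichlet(paddle.to_tensor(conc))
+    np.testing.assert_allclose(dist.mean, scipy.stats.dirichlet.mean(conc), rtol=1e-6)
+    value = np.array([0.2, 0.3, 0.5])  # a point on the probability simplex
+    np.testing.assert_allclose(
+        dist.prob(paddle.to_tensor(value)),
+        scipy.stats.dirichlet.pdf(value, conc),
+        rtol=1e-6)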
+ +import unittest + +import numpy as np +import paddle +import scipy.stats + +import config +from config import (ATOL, DEVICES, RTOL, TEST_CASE_NAME, parameterize, place, + xrand) + + +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'concentration'), + [ + ('test-one-dim', config.xrand((89, ))), + # ('test-multi-dim', config.xrand((10, 20, 30))) + ]) +class TestDirichlet(unittest.TestCase): + def setUp(self): + self._paddle_diric = paddle.distribution.Dirichlet( + paddle.to_tensor(self.concentration)) + + def test_mean(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_diric.mean, + scipy.stats.dirichlet.mean(self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) + + def test_variance(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_diric.variance, + scipy.stats.dirichlet.var(self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) + + def test_prob(self): + value = [np.random.rand(*self.concentration.shape)] + value = [v / v.sum() for v in value] + + for v in value: + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_diric.prob(paddle.to_tensor(v)), + scipy.stats.dirichlet.pdf(v, self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) + + def test_log_prob(self): + value = [np.random.rand(*self.concentration.shape)] + value = [v / v.sum() for v in value] + + for v in value: + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_diric.log_prob(paddle.to_tensor(v)), + scipy.stats.dirichlet.logpdf(v, self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) + + def test_entropy(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self._paddle_diric.entropy(), + scipy.stats.dirichlet.entropy(self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) + + def test_natural_parameters(self): + self.assertTrue( + isinstance(self._paddle_diric._natural_parameters, tuple)) + + def test_log_normalizer(self): + self.assertTrue( + np.all( + self._paddle_diric._log_normalizer( + paddle.to_tensor(config.xrand((100, 100, 100)))).numpy() < + 0.0)) + + @place(DEVICES) + @parameterize((TEST_CASE_NAME, 'concentration'), + [('test-zero-dim', np.array(1.0))]) + class TestDirichletException(unittest.TestCase): + def TestInit(self): + with self.assertRaises(ValueError): + paddle.distribution.Dirichlet( + paddle.squeeze(self.concentration)) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py new file mode 100644 index 0000000000000..c84da943cf6cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_dirichlet_static.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import scipy.stats + +from config import (ATOL, DEVICES, RTOL, TEST_CASE_NAME, parameterize, place, + xrand) + +paddle.enable_static() + + +@place(DEVICES) +@parameterize((TEST_CASE_NAME, 'concentration'), + [('test-one-dim', np.random.rand(89) + 5.0)]) +class TestDirichlet(unittest.TestCase): + def setUp(self): + self.program = paddle.static.Program() + self.executor = paddle.static.Executor() + with paddle.static.program_guard(self.program): + conc = paddle.static.data('conc', self.concentration.shape, + self.concentration.dtype) + self._paddle_diric = paddle.distribution.Dirichlet(conc) + self.feeds = {'conc': self.concentration} + + def test_mean(self): + with paddle.static.program_guard(self.program): + [out] = self.executor.run(self.program, + feed=self.feeds, + fetch_list=[self._paddle_diric.mean]) + np.testing.assert_allclose( + out, + scipy.stats.dirichlet.mean(self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) + + def test_variance(self): + with paddle.static.program_guard(self.program): + [out] = self.executor.run(self.program, + feed=self.feeds, + fetch_list=[self._paddle_diric.variance]) + np.testing.assert_allclose( + out, + scipy.stats.dirichlet.var(self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) + + def test_prob(self): + with paddle.static.program_guard(self.program): + random_number = np.random.rand(*self.concentration.shape) + random_number = random_number / random_number.sum() + feeds = dict(self.feeds, value=random_number) + value = paddle.static.data('value', random_number.shape, + random_number.dtype) + out = self._paddle_diric.prob(value) + [out] = self.executor.run(self.program, + feed=feeds, + fetch_list=[out]) + np.testing.assert_allclose( + out, + scipy.stats.dirichlet.pdf(random_number, self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) + + def test_log_prob(self): + with paddle.static.program_guard(self.program): + random_number = np.random.rand(*self.concentration.shape) + random_number = random_number / random_number.sum() + feeds = dict(self.feeds, value=random_number) + value = paddle.static.data('value', random_number.shape, + random_number.dtype) + out = self._paddle_diric.log_prob(value) + [out] = self.executor.run(self.program, + feed=feeds, + fetch_list=[out]) + np.testing.assert_allclose( + out, + scipy.stats.dirichlet.logpdf(random_number, self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) + + def test_entropy(self): + with paddle.static.program_guard(self.program): + [out] = self.executor.run( + self.program, + feed=self.feeds, + fetch_list=[self._paddle_diric.entropy()]) + np.testing.assert_allclose( + out, + scipy.stats.dirichlet.entropy(self.concentration), + rtol=RTOL.get(str(self.concentration.dtype)), + atol=ATOL.get(str(self.concentration.dtype))) diff --git 
a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py new file mode 100644 index 0000000000000..cc2e14d6d6c2e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import scipy.stats + +import config +import mock_data as mock + + +@config.place(config.DEVICES) +@config.parameterize( + (config.TEST_CASE_NAME, 'dist'), [('test-mock-exp', + mock.Exponential(rate=paddle.rand( + [100, 200, 99], + dtype=config.DEFAULT_DTYPE)))]) +class TestExponentialFamily(unittest.TestCase): + def test_entropy(self): + np.testing.assert_allclose( + self.dist.entropy(), + paddle.distribution.ExponentialFamily.entropy(self.dist), + rtol=config.RTOL.get(config.DEFAULT_DTYPE), + atol=config.ATOL.get(config.DEFAULT_DTYPE)) + + +@config.place(config.DEVICES) +@config.parameterize( + (config.TEST_CASE_NAME, 'dist'), + [('test-dummy', mock.DummyExpFamily(0.5, 0.5)), + ('test-dirichlet', + paddle.distribution.Dirichlet(paddle.to_tensor(config.xrand()))), ( + 'test-beta', paddle.distribution.Beta( + paddle.to_tensor(config.xrand()), + paddle.to_tensor(config.xrand())))]) +class TestExponentialFamilyException(unittest.TestCase): + def test_entropy_exception(self): + with self.assertRaises(NotImplementedError): + paddle.distribution.ExponentialFamily.entropy(self.dist) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py new file mode 100644 index 0000000000000..bb6317f1d56fd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_expfamily_static.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
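+# Background sketch (never executed): mock.Exponential exposes the natural
+# parameter eta = -rate and the log-normalizer A(eta) = -log(-eta), and the
+# generic exponential-family entropy is assumed to follow the standard identity
+# H = A(eta) - <eta, dA/deta> (the carrier-measure term is zero for this mock),
+# which reduces to the closed form 1 - log(rate) used by mock.Exponential.entropy.
+def _expfamily_entropy_sketch():
+    import numpy as np
+
+    rate = np.array([0.5, 1.0, 2.0])
+    eta = -rate                       # natural parameter
+    log_normalizer = -np.log(-eta)    # A(eta) = -log(rate)
+    mean_suff_stat = -1.0 / eta       # dA/deta = E[x] = 1 / rate
+    entropy = log_normalizer - eta * mean_suff_stat
+    np.testing.assert_allclose(entropy, 1.0 - np.log(rate))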
+ +import unittest + +import numpy as np +import paddle +import scipy.stats + +import config +import mock_data as mock + +paddle.enable_static() + + +@config.place(config.DEVICES) +class TestExponentialFamily(unittest.TestCase): + def setUp(self): + self.program = paddle.static.Program() + self.executor = paddle.static.Executor() + with paddle.static.program_guard(self.program): + rate_np = config.xrand((100, 200, 99)) + rate = paddle.static.data('rate', rate_np.shape, rate_np.dtype) + self.mock_dist = mock.Exponential(rate) + self.feeds = {'rate': rate_np} + + def test_entropy(self): + with paddle.static.program_guard(self.program): + [out1, out2] = self.executor.run( + self.program, + feed=self.feeds, + fetch_list=[ + self.mock_dist.entropy(), + paddle.distribution.ExponentialFamily.entropy( + self.mock_dist) + ]) + + np.testing.assert_allclose( + out1, + out2, + rtol=config.RTOL.get(config.DEFAULT_DTYPE), + atol=config.ATOL.get(config.DEFAULT_DTYPE)) + + def test_entropy_exception(self): + with paddle.static.program_guard(self.program): + with self.assertRaises(NotImplementedError): + paddle.distribution.ExponentialFamily.entropy( + mock.DummyExpFamily(0.5, 0.5)) diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py new file mode 100644 index 0000000000000..d1ded2256e250 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_normal.py @@ -0,0 +1,456 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import unittest + +import numpy as np +import paddle +from paddle import fluid +from paddle.distribution import * +from paddle.fluid import layers + +from test_distribution import DistributionNumpy + + +class NormalNumpy(DistributionNumpy): + def __init__(self, loc, scale): + self.loc = np.array(loc) + self.scale = np.array(scale) + if str(self.loc.dtype) not in ['float32', 'float64']: + self.loc = self.loc.astype('float32') + self.scale = self.scale.astype('float32') + + def sample(self, shape): + shape = tuple(shape) + (self.loc + self.scale).shape + return self.loc + (np.random.randn(*shape) * self.scale) + + def log_prob(self, value): + var = self.scale * self.scale + log_scale = np.log(self.scale) + return -((value - self.loc) * (value - self.loc)) / ( + 2. * var) - log_scale - math.log(math.sqrt(2. * math.pi)) + + def probs(self, value): + var = self.scale * self.scale + return np.exp(-1. * ((value - self.loc) * (value - self.loc)) / + (2. * var)) / (math.sqrt(2 * math.pi) * self.scale) + + def entropy(self): + return 0.5 + 0.5 * np.log( + np.array(2. 
* math.pi).astype(self.loc.dtype)) + np.log(self.scale) + + def kl_divergence(self, other): + var_ratio = (self.scale / other.scale) + var_ratio = var_ratio * var_ratio + t1 = ((self.loc - other.loc) / other.scale) + t1 = (t1 * t1) + return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) + + +class NormalTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=2, dims=3): + self.use_gpu = use_gpu + if not use_gpu: + self.place = fluid.CPUPlace() + self.gpu_id = -1 + else: + self.place = fluid.CUDAPlace(0) + self.gpu_id = 0 + + self.init_numpy_data(batch_size, dims) + + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) + + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # loc ans scale are 'float' + self.loc_np = (np.random.ranf() - 0.5) * 4 + self.scale_np = (np.random.ranf() - 0.5) * 4 + while self.scale_np < 0: + self.scale_np = (np.random.ranf() - 0.5) * 4 + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = (np.random.ranf() - 0.5) * 4 + self.other_scale_np = (np.random.ranf() - 0.5) * 4 + while self.other_scale_np < 0: + self.other_scale_np = (np.random.ranf() - 0.5) * 4 + self.values_np = np.random.ranf(1).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = self.loc_np + self.dynamic_scale = self.scale_np + self.dynamic_other_loc = self.other_loc_np + self.dynamic_other_scale = self.other_scale_np + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[], dtype='float32') + + def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): + sample, entropy, log_prob, probs, kl = fetch_list + + np_normal = NormalNumpy(self.loc_np, self.scale_np) + np_sample = np_normal.sample([sample_shape]) + np_entropy = np_normal.entropy() + np_lp = np_normal.log_prob(self.values_np) + np_p = np_normal.probs(self.values_np) + np_other_normal = NormalNumpy(self.other_loc_np, self.other_scale_np) + np_kl = np_normal.kl_divergence(np_other_normal) + + # Because assign op does not support the input of numpy.ndarray whose dtype is FP64. + # When loc and scale are FP64 numpy.ndarray, we need to use assign op to convert it + # to FP32 Tensor. And then use cast op to convert it to a FP64 Tensor. + # There is a loss of accuracy in this conversion. + # So set the tolerance from 1e-6 to 1e-4. 
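+        # For a sense of scale: float32 keeps roughly 7 significant decimal
+        # digits, so one float64 -> float32 -> float64 round trip shifts a value
+        # by a relative ~1e-7 at most, e.g.
+        #     np.float64(np.float32(1.0 / 3.0)) - 1.0 / 3.0  ->  ~9.9e-09
+        # which a 1e-4 tolerance covers with a wide margin.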
+ log_tolerance = 1e-4 + + np.testing.assert_equal(sample.shape, np_sample.shape) + np.testing.assert_allclose( + entropy, np_entropy, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + log_prob, np_lp, rtol=log_tolerance, atol=log_tolerance) + np.testing.assert_allclose( + probs, np_p, rtol=log_tolerance, atol=log_tolerance) + np.testing.assert_allclose( + kl, np_kl, rtol=log_tolerance, atol=log_tolerance) + + def test_normal_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): + paddle.disable_static(self.place) + normal = Normal(self.dynamic_loc, self.dynamic_scale) + + sample = normal.sample([sample_shape]).numpy() + entropy = normal.entropy().numpy() + log_prob = normal.log_prob(self.dynamic_values).numpy() + probs = normal.probs(self.dynamic_values).numpy() + other_normal = Normal(self.dynamic_other_loc, self.dynamic_other_scale) + kl = normal.kl_divergence(other_normal).numpy() + + fetch_list = [sample, entropy, log_prob, probs, kl] + self.compare_with_numpy(fetch_list) + + def test_normal_distribution_static(self, sample_shape=7, tolerance=1e-6): + paddle.enable_static() + with fluid.program_guard(self.test_program): + normal = Normal(self.static_loc, self.static_scale) + + sample = normal.sample([sample_shape]) + entropy = normal.entropy() + log_prob = normal.log_prob(self.static_values) + probs = normal.probs(self.static_values) + other_normal = Normal(self.static_other_loc, + self.static_other_scale) + kl = normal.kl_divergence(other_normal) + + fetch_list = [sample, entropy, log_prob, probs, kl] + + feed_vars = { + 'loc': self.loc_np, + 'scale': self.scale_np, + 'values': self.values_np, + 'other_loc': self.other_loc_np, + 'other_scale': self.other_scale_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class NormalTest2(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc ans scale are 'int' + self.loc_np = int((np.random.ranf() - 0.5) * 8) + self.scale_np = int((np.random.ranf() - 0.5) * 8) + while self.scale_np < 0: + self.scale_np = int((np.random.ranf() - 0.5) * 8) + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = int((np.random.ranf() - 0.5) * 8) + self.other_scale_np = int((np.random.ranf() - 0.5) * 8) + while self.other_scale_np < 0: + self.other_scale_np = int((np.random.ranf() - 0.5) * 8) + self.values_np = np.random.ranf(1).astype('float32') + + +class NormalTest3(NormalTest): + def init_numpy_data(self, batch_size, dims): + # test broadcast: loc is float, scale is numpy.ndarray with dtype 'float32'. 
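+        # Per the NormalNumpy reference above, a scalar loc broadcasts against a
+        # [batch_size, dims] scale, so sample([7]) is expected to have shape
+        # [7, batch_size, dims].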
+ self.loc_np = (np.random.ranf() - 0.5) * 4 + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = (np.random.ranf() - 0.5) * 4 + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest4(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are numpy.ndarray with dtype 'float32'. + self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest5(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are numpy.ndarray with dtype 'float64'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = self.loc_np + self.dynamic_scale = self.scale_np + self.dynamic_other_loc = self.other_loc_np + self.dynamic_other_scale = self.other_scale_np + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class NormalTest6(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP32'. + self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np) + self.dynamic_scale = paddle.to_tensor(self.scale_np) + self.dynamic_values = paddle.to_tensor(self.values_np) + self.dynamic_other_loc = paddle.to_tensor(self.other_loc_np) + self.dynamic_other_scale = paddle.to_tensor(self.other_scale_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float32') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float32') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float32') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float32') + + +class NormalTest7(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP64'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') + self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + self.dynamic_other_loc = paddle.to_tensor( + self.other_loc_np, dtype='float64') + self.dynamic_other_scale = paddle.to_tensor( + self.other_scale_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float64') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float64') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float64') + + +class NormalTest8(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'. + self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') + self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np) + self.dynamic_other_loc = paddle.to_tensor( + self.other_loc_np, dtype='float64') + self.dynamic_other_scale = paddle.to_tensor( + self.other_scale_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float64') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float64') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float64') + + +class NormalTest9(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are list. 
+ self.loc_np = np.random.randn(batch_size, + dims).astype('float32').tolist() + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = self.scale_np.tolist() + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, + dims).astype('float32').tolist() + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.other_scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + self.other_scale_np = self.other_scale_np.tolist() + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest10(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are tuple. + self.loc_np = tuple( + np.random.randn(batch_size, dims).astype('float32').tolist()) + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = tuple(self.scale_np.tolist()) + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = tuple( + np.random.randn(batch_size, dims).astype('float32').tolist()) + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.other_scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + self.other_scale_np = tuple(self.other_scale_np.tolist()) + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py new file mode 100644 index 0000000000000..e6076764b04fe --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_uniform.py @@ -0,0 +1,345 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
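+# Quick sketch (never called) of the reference semantics exercised below: on the
+# open interval (low, high) the density is 1 / (high - low), so log_prob is
+# -log(high - low) inside the support (and -inf outside), and the entropy is
+# log(high - low); this mirrors the UniformNumpy helper defined further down.
+def _uniform_reference_sketch():
+    import numpy as np
+
+    low, high, value = 0.0, 4.0, 1.0
+    inside = float(low < value < high)               # 1.0: value lies in (low, high)
+    log_prob = np.log(inside) - np.log(high - low)   # log(1) - log(4) = log(0.25)
+    entropy = np.log(high - low)                     # log(4.0)
+    return log_prob, entropy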
+ +import math +import unittest + +import numpy as np +import paddle +from paddle import fluid +from paddle.distribution import * +from paddle.fluid import layers + +from test_distribution import DistributionNumpy + + +class UniformNumpy(DistributionNumpy): + def __init__(self, low, high): + self.low = np.array(low) + self.high = np.array(high) + if str(self.low.dtype) not in ['float32', 'float64']: + self.low = self.low.astype('float32') + self.high = self.high.astype('float32') + + def sample(self, shape): + shape = tuple(shape) + (self.low + self.high).shape + return self.low + (np.random.uniform(size=shape) * + (self.high - self.low)) + + def log_prob(self, value): + lb = np.less(self.low, value).astype(self.low.dtype) + ub = np.less(value, self.high).astype(self.low.dtype) + return np.log(lb * ub) - np.log(self.high - self.low) + + def probs(self, value): + lb = np.less(self.low, value).astype(self.low.dtype) + ub = np.less(value, self.high).astype(self.low.dtype) + return (lb * ub) / (self.high - self.low) + + def entropy(self): + return np.log(self.high - self.low) + + +class UniformTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=5, dims=6): + self.use_gpu = use_gpu + if not use_gpu: + self.place = fluid.CPUPlace() + self.gpu_id = -1 + else: + self.place = fluid.CUDAPlace(0) + self.gpu_id = 0 + + self.init_numpy_data(batch_size, dims) + + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) + + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # low ans high are 'float' + self.low_np = np.random.uniform(-2, 1) + self.high_np = np.random.uniform(2, 4) + self.values_np = np.array([1.0]).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = self.low_np + self.dynamic_high = self.high_np + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[], dtype='float32') + + def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): + sample, entropy, log_prob, probs = fetch_list + + np_uniform = UniformNumpy(self.low_np, self.high_np) + np_sample = np_uniform.sample([sample_shape]) + np_entropy = np_uniform.entropy() + np_lp = np_uniform.log_prob(self.values_np) + np_p = np_uniform.probs(self.values_np) + + np.testing.assert_equal(sample.shape, np_sample.shape) + np.testing.assert_allclose( + entropy, np_entropy, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + log_prob, np_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(probs, np_p, rtol=tolerance, atol=tolerance) + + def test_uniform_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): + paddle.disable_static(self.place) + uniform = Uniform(self.dynamic_low, self.dynamic_high) + sample = uniform.sample([sample_shape]).numpy() + entropy = uniform.entropy().numpy() + log_prob = uniform.log_prob(self.dynamic_values).numpy() + probs = uniform.probs(self.dynamic_values).numpy() + fetch_list = [sample, entropy, log_prob, probs] + + self.compare_with_numpy(fetch_list) + + def test_uniform_distribution_static(self, sample_shape=7, tolerance=1e-6): + paddle.enable_static() + with fluid.program_guard(self.test_program): + uniform = 
Uniform(self.static_low, self.static_high) + sample = uniform.sample([sample_shape]) + entropy = uniform.entropy() + log_prob = uniform.log_prob(self.static_values) + probs = uniform.probs(self.static_values) + fetch_list = [sample, entropy, log_prob, probs] + + feed_vars = { + 'low': self.low_np, + 'high': self.high_np, + 'values': self.values_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class UniformTest2(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low ans high are 'int' + self.low_np = int(np.random.uniform(-2, 1)) + self.high_np = int(np.random.uniform(2, 4)) + self.values_np = np.array([1.0]).astype('float32') + + +class UniformTest3(UniformTest): + def init_numpy_data(self, batch_size, dims): + # test broadcast: low is float, high is numpy.ndarray with dtype 'float32'. + self.low_np = np.random.uniform(-2, 1) + self.high_np = np.random.uniform(5.0, 15.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest4(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are numpy.ndarray with dtype 'float32'. + self.low_np = np.random.randn(batch_size, dims).astype('float32') + self.high_np = np.random.uniform(5.0, 15.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest5(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are numpy.ndarray with dtype 'float64'. + self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(5.0, 15.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = self.low_np + self.dynamic_high = self.high_np + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class UniformTest6(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP32'. 
+ self.low_np = np.random.randn(batch_size, dims).astype('float32') + self.high_np = np.random.uniform(5.0, 15.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np) + self.dynamic_high = paddle.to_tensor(self.high_np) + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float32') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float32') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest7(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP64'. + self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(5.0, 15.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') + self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float64') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class UniformTest8(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'. + self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(5.0, 15.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') + self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float32') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float64') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest9(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are numpy.ndarray with dtype 'float32'. + # high < low. + self.low_np = np.random.randn(batch_size, dims).astype('float32') + self.high_np = np.random.uniform(-10.0, -5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest10(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are list. 
+ self.low_np = np.random.randn(batch_size, + dims).astype('float32').tolist() + self.high_np = np.random.uniform( + 5.0, 15.0, (batch_size, dims)).astype('float32').tolist() + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest11(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are tuple. + self.low_np = tuple( + np.random.randn(batch_size, dims).astype('float32').tolist()) + self.high_np = tuple( + np.random.uniform(5.0, 15.0, (batch_size, dims)).astype('float32') + .tolist()) + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTestSample(unittest.TestCase): + def setUp(self): + self.init_param() + + def init_param(self): + self.low = 3.0 + self.high = 4.0 + + def test_uniform_sample(self): + paddle.disable_static() + uniform = Uniform(low=self.low, high=self.high) + s = uniform.sample([100]) + self.assertTrue((s >= self.low).all()) + self.assertTrue((s < self.high).all()) + paddle.enable_static() + + +class UniformTestSample2(UniformTestSample): + def init_param(self): + self.low = -5.0 + self.high = 2.0 diff --git a/python/paddle/fluid/tests/unittests/distribution/test_kl.py b/python/paddle/fluid/tests/unittests/distribution/test_kl.py new file mode 100644 index 0000000000000..a1413722446e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_kl.py @@ -0,0 +1,114 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
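The UniformNumpy reference above encodes the identities the UniformTest cases check: entropy equals log(high - low), and log_prob equals -log(high - low) for values strictly inside the support. A minimal standalone sketch of that check against paddle.distribution.Uniform (scalar low/high, assuming dygraph mode):

import numpy as np
import paddle

low, high = 3.0, 5.0
uniform = paddle.distribution.Uniform(low, high)

# Values strictly inside (low, high), matching the open-interval check in UniformNumpy.
values = paddle.to_tensor([3.5, 4.5], dtype='float32')

np.testing.assert_allclose(
    uniform.entropy().numpy(), np.log(high - low), rtol=1e-6, atol=1e-6)
np.testing.assert_allclose(
    uniform.log_prob(values).numpy(),
    np.full([2], -np.log(high - low), dtype='float32'),
    rtol=1e-6,
    atol=1e-6)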
+ +import numbers +import unittest + +import numpy as np +import paddle +import scipy.special +import scipy.stats +from paddle.distribution import kl + +import config +import mock_data as mock + +paddle.set_default_dtype('float64') + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'a1', 'b1', 'a2', 'b2'), [ + ('test_regular_input', 6.0 * np.random.random((4, 5)) + 1e-4, + 6.0 * np.random.random((4, 5)) + 1e-4, 6.0 * np.random.random( + (4, 5)) + 1e-4, 6.0 * np.random.random((4, 5)) + 1e-4), +]) +class TestKLBetaBeta(unittest.TestCase): + def setUp(self): + self.p = paddle.distribution.Beta( + paddle.to_tensor(self.a1), paddle.to_tensor(self.b1)) + self.q = paddle.distribution.Beta( + paddle.to_tensor(self.a2), paddle.to_tensor(self.b2)) + + def test_kl_divergence(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + paddle.distribution.kl_divergence(self.p, self.q), + self.scipy_kl_beta_beta(self.a1, self.b1, self.a2, self.b2), + rtol=config.RTOL.get(str(self.a1.dtype)), + atol=config.ATOL.get(str(self.a1.dtype))) + + def scipy_kl_beta_beta(self, a1, b1, a2, b2): + return (scipy.special.betaln(a2, b2) - scipy.special.betaln(a1, b1) + + (a1 - a2) * scipy.special.digamma(a1) + + (b1 - b2) * scipy.special.digamma(b1) + + (a2 - a1 + b2 - b1) * scipy.special.digamma(a1 + b1)) + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'conc1', 'conc2'), [ + ('test-regular-input', np.random.random((5, 7, 8, 10)), np.random.random( + (5, 7, 8, 10))), +]) +class TestKLDirichletDirichlet(unittest.TestCase): + def setUp(self): + self.p = paddle.distribution.Dirichlet(paddle.to_tensor(self.conc1)) + self.q = paddle.distribution.Dirichlet(paddle.to_tensor(self.conc2)) + + def test_kl_divergence(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + paddle.distribution.kl_divergence(self.p, self.q), + self.scipy_kl_diric_diric(self.conc1, self.conc2), + rtol=config.RTOL.get(str(self.conc1.dtype)), + atol=config.ATOL.get(str(self.conc1.dtype))) + + def scipy_kl_diric_diric(self, conc1, conc2): + return ( + scipy.special.gammaln(np.sum(conc1, -1)) - + scipy.special.gammaln(np.sum(conc2, -1)) - np.sum( + scipy.special.gammaln(conc1) - scipy.special.gammaln(conc2), -1) + + np.sum((conc1 - conc2) * + (scipy.special.digamma(conc1) - + scipy.special.digamma(np.sum(conc1, -1, keepdims=True))), + -1)) + + +class DummyDistribution(paddle.distribution.Distribution): + pass + + +@config.place(config.DEVICES) +@config.parameterize( + (config.TEST_CASE_NAME, 'p', 'q'), + [('test-unregister', DummyDistribution(), DummyDistribution)]) +class TestDispatch(unittest.TestCase): + def test_dispatch_with_unregister(self): + with self.assertRaises(NotImplementedError): + paddle.distribution.kl_divergence(self.p, self.q) + + +@config.place(config.DEVICES) +@config.parameterize( + (config.TEST_CASE_NAME, 'p', 'q'), + [('test-diff-dist', mock.Exponential(paddle.rand((100, 200, 100)) + 1.0), + mock.Exponential(paddle.rand((100, 200, 100)) + 2.0)), + ('test-same-dist', mock.Exponential(paddle.to_tensor(1.0)), + mock.Exponential(paddle.to_tensor(1.0)))]) +class TestKLExpfamilyExpFamily(unittest.TestCase): + def test_kl_expfamily_expfamily(self): + np.testing.assert_allclose( + paddle.distribution.kl_divergence(self.p, self.q), + kl._kl_expfamily_expfamily(self.p, self.q), + rtol=config.RTOL.get(config.DEFAULT_DTYPE), + atol=config.ATOL.get(config.DEFAULT_DTYPE)) diff --git 
a/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py b/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py new file mode 100644 index 0000000000000..828a7320d474f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_kl_static.py @@ -0,0 +1,178 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +import unittest + +import numpy as np +import paddle +import scipy.special +import scipy.stats +from paddle.distribution import kl + +import config +import mock_data as mock + +paddle.enable_static() + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'a1', 'b1', 'a2', 'b2'), [ + ('test_regular_input', 6.0 * np.random.random((4, 5)) + 1e-4, + 6.0 * np.random.random((4, 5)) + 1e-4, 6.0 * np.random.random( + (4, 5)) + 1e-4, 6.0 * np.random.random((4, 5)) + 1e-4), +]) +class TestKLBetaBeta(unittest.TestCase): + def setUp(self): + self.mp = paddle.static.Program() + self.sp = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + + with paddle.static.program_guard(self.mp, self.sp): + a1 = paddle.static.data('a1', self.a1.shape, dtype=self.a1.dtype) + b1 = paddle.static.data('b1', self.b1.shape, dtype=self.b1.dtype) + a2 = paddle.static.data('a2', self.a2.shape, dtype=self.a2.dtype) + b2 = paddle.static.data('b2', self.b2.shape, dtype=self.b2.dtype) + + self.p = paddle.distribution.Beta(a1, b1) + self.q = paddle.distribution.Beta(a2, b2) + self.feeds = { + 'a1': self.a1, + 'b1': self.b1, + 'a2': self.a2, + 'b2': self.b2 + } + + def test_kl_divergence(self): + with paddle.static.program_guard(self.mp, self.sp): + out = paddle.distribution.kl_divergence(self.p, self.q) + self.executor.run(self.sp) + [out] = self.executor.run(self.mp, + feed=self.feeds, + fetch_list=[out]) + + np.testing.assert_allclose( + out, + self.scipy_kl_beta_beta(self.a1, self.b1, self.a2, self.b2), + rtol=config.RTOL.get(str(self.a1.dtype)), + atol=config.ATOL.get(str(self.a1.dtype))) + + def scipy_kl_beta_beta(self, a1, b1, a2, b2): + return (scipy.special.betaln(a2, b2) - scipy.special.betaln(a1, b1) + + (a1 - a2) * scipy.special.digamma(a1) + + (b1 - b2) * scipy.special.digamma(b1) + + (a2 - a1 + b2 - b1) * scipy.special.digamma(a1 + b1)) + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'conc1', 'conc2'), [ + ('test-regular-input', np.random.random((5, 7, 8, 10)), np.random.random( + (5, 7, 8, 10))), +]) +class TestKLDirichletDirichlet(unittest.TestCase): + def setUp(self): + self.mp = paddle.static.Program() + self.sp = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + with paddle.static.program_guard(self.mp, self.sp): + conc1 = paddle.static.data('conc1', self.conc1.shape, + self.conc1.dtype) + conc2 = paddle.static.data('conc2', self.conc2.shape, + self.conc2.dtype) + self.p = paddle.distribution.Dirichlet(conc1) + self.q = paddle.distribution.Dirichlet(conc2) + self.feeds = {'conc1': 
self.conc1, 'conc2': self.conc2} + + def test_kl_divergence(self): + + with paddle.static.program_guard(self.mp, self.sp): + out = paddle.distribution.kl_divergence(self.p, self.q) + self.executor.run(self.sp) + [out] = self.executor.run(self.mp, + feed=self.feeds, + fetch_list=[out]) + np.testing.assert_allclose( + out, + self.scipy_kl_diric_diric(self.conc1, self.conc2), + rtol=config.RTOL.get(str(self.conc1.dtype)), + atol=config.ATOL.get(str(self.conc1.dtype))) + + def scipy_kl_diric_diric(self, conc1, conc2): + return ( + scipy.special.gammaln(np.sum(conc1, -1)) - + scipy.special.gammaln(np.sum(conc2, -1)) - np.sum( + scipy.special.gammaln(conc1) - scipy.special.gammaln(conc2), -1) + + np.sum((conc1 - conc2) * + (scipy.special.digamma(conc1) - + scipy.special.digamma(np.sum(conc1, -1, keepdims=True))), + -1)) + + +class DummyDistribution(paddle.distribution.Distribution): + pass + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'p', 'q'), + [('test-dispatch-exception')]) +class TestDispatch(unittest.TestCase): + def setUp(self): + self.mp = paddle.static.Program() + self.sp = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + with paddle.static.program_guard(self.mp, self.sp): + self.p = DummyDistribution() + self.q = DummyDistribution() + + def test_dispatch_with_unregister(self): + with self.assertRaises(NotImplementedError): + with paddle.static.program_guard(self.mp, self.sp): + out = paddle.distribution.kl_divergence(self.p, self.q) + self.executor.run(self.sp) + self.executor.run(self.mp, feed={}, fetch_list=[out]) + + +@config.place(config.DEVICES) +@config.parameterize((config.TEST_CASE_NAME, 'rate1', 'rate2'), + [('test-diff-dist', np.random.rand(100, 200, 100) + 1.0, + np.random.rand(100, 200, 100) + 2.0), + ('test-same-dist', np.array([1.0]), np.array([1.0]))]) +class TestKLExpfamilyExpFamily(unittest.TestCase): + def setUp(self): + self.mp = paddle.static.Program() + self.sp = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + with paddle.static.program_guard(self.mp, self.sp): + rate1 = paddle.static.data( + 'rate1', shape=self.rate1.shape, dtype=self.rate1.dtype) + rate2 = paddle.static.data( + 'rate2', shape=self.rate2.shape, dtype=self.rate2.dtype) + self.p = mock.Exponential(rate1) + self.q = mock.Exponential(rate2) + self.feeds = {'rate1': self.rate1, 'rate2': self.rate2} + + def test_kl_expfamily_expfamily(self): + with paddle.static.program_guard(self.mp, self.sp): + out1 = paddle.distribution.kl_divergence(self.p, self.q) + out2 = kl._kl_expfamily_expfamily(self.p, self.q) + self.executor.run(self.sp) + [out1, out2] = self.executor.run(self.mp, + feed=self.feeds, + fetch_list=[out1, out2]) + + np.testing.assert_allclose( + out1, + out2, + rtol=config.RTOL.get(config.DEFAULT_DTYPE), + atol=config.ATOL.get(config.DEFAULT_DTYPE)) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index 04f44f68b4234..ecb7d7f6bd19c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -205,6 +205,7 @@ def __init__(self, hidden_dim=16): self.alpha = 10. 
self.constant_vars = {} + @paddle.jit.to_static def forward(self, input): hidden_dim = input.shape[-1] if hidden_dim != self.hidden_dim: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index 91086c31a396a..d18c691325094 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -379,5 +379,74 @@ def test_raise_error(self): net.forward.outputs +class CallNonForwardFuncNet(paddle.nn.Layer): + def __init__(self): + super(CallNonForwardFuncNet, self).__init__() + self.sub = CallNonForwardFuncSubNet() + + @paddle.jit.to_static + def forward(self): + return self.sub.func() + + +class CallNonForwardFuncSubNet(paddle.nn.Layer): + def __init__(self): + super(CallNonForwardFuncSubNet, self).__init__() + self.a = paddle.to_tensor([1, 2]) + + def func(self): + x = self.a * 2 + return x + + +class TestCallNonForwardFunc(unittest.TestCase): + def test_call_non_forward(self): + paddle.disable_static() + net = CallNonForwardFuncNet() + out = net() + self.assertEqual(out.numpy().tolist(), [2, 4]) + paddle.enable_static() + + +class SetBuffersNet1(paddle.nn.Layer): + def __init__(self): + super(SetBuffersNet1, self).__init__() + self.a = paddle.to_tensor([1]) + + @paddle.jit.to_static + def forward(self): + self.a = self.a + 1 + return self.a + + +class SetBuffersNet2(paddle.nn.Layer): + def __init__(self): + super(SetBuffersNet2, self).__init__() + self.b = paddle.to_tensor([2]) + + @paddle.jit.to_static + def forward(self): + self.b = None + self.b = paddle.to_tensor([3]) + return self.b + + +class TestSetBuffers(unittest.TestCase): + def test_set_buffers1(self): + paddle.disable_static() + net = SetBuffersNet1() + out = net() + self.assertEqual(out.numpy().tolist(), [2]) + paddle.jit.save(net, './SetBuffersNet1') + paddle.enable_static() + + def test_set_buffers2(self): + paddle.disable_static() + net = SetBuffersNet2() + with self.assertRaises(RuntimeError): + out = net() + paddle.enable_static() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index 7e999e3b21a88..171685e4a40f7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -410,14 +410,17 @@ def setUp(self): self.dyfunc = dyfunc_ifelse_ret_int4 def test_ast_to_func(self): + ProgramTranslator().enable(True) with self.assertRaises(TypeError): - ProgramTranslator().enable(True) static_func = paddle.jit.to_static(self.dyfunc) out = static_func(self.x) - - def __del__(self): + # Why need set `_in_declarative_mode_` here? + # In Dy2St we use `with _switch_declarative_mode_guard_()` to indicate + # that the code block is under @to_static, but in this UT + # an exception is thrown during Dy2St, making the `_in_declarative_mode_` + # a wrong value. So We need set `_in_declarative_mode_` to False manually. 
+ paddle.fluid.dygraph.base._in_declarative_mode_ = False ProgramTranslator().enable(False) - super(TestDy2StIfElseRetInt4, self).__del__() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py new file mode 100644 index 0000000000000..361fcbf9c73f5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.nn import Layer +import numpy as np +import unittest + + +class Net(Layer): + def __init__(self): + super(Net, self).__init__() + self.fc = paddle.nn.Linear(16, 3) + + def forward(self, x, y, m, n): + inputs = [x, y, m, n] + outs = [] + for var in inputs: + out = paddle.reshape(x, [-1, 16]) + out = self.fc(out) + outs.append(out) + + out = paddle.stack(outs) + return paddle.sum(out) + + +class TestArgsSpecName(unittest.TestCase): + def read_from_dataset(self): + self.x = paddle.randn([4, 2, 8]) + self.y = paddle.randn([4, 2, 8]) + self.m = paddle.randn([4, 2, 8]) + self.n = paddle.randn([4, 2, 8]) + + def test_spec_name_hash(self): + net = Net() + net = paddle.jit.to_static(net) + # Convert into program with four input + self.read_from_dataset() + self.run_test(net, [self.x, self.y, self.m, self.n], 1, [0, 1, 2, 3]) + + # Convert into program with three input + self.read_from_dataset() + self.run_test(net, [self.x, self.x, self.m, self.n], 2, [0, 0, 1, 2]) + + # Convert into program with two input + self.read_from_dataset() + self.run_test(net, [self.x, self.x, self.m, self.m], 3, [0, 0, 1, 1]) + + # Use Cache Program + self.read_from_dataset() + self.run_test(net, [self.n, self.n, self.y, self.y], 3, [0, 0, 1, 1]) + + # Convert into program with two input + self.read_from_dataset() + self.run_test(net, [self.x, self.y, self.x, self.y], 4, [0, 1, 0, 1]) + + # Use Cache Program + self.read_from_dataset() + self.run_test(net, [self.m, self.n, self.m, self.n], 4, [0, 1, 0, 1]) + + # Convert into program with one input + self.read_from_dataset() + self.run_test(net, [self.x, self.x, self.x, self.x], 5, [0, 0, 0, 0]) + + # Use Cache Program + self.read_from_dataset() + self.run_test(net, [self.m, self.m, self.m, self.m], 5, [0, 0, 0, 0]) + + def run_test(self, net, inputs, trace_count, mode): + out = net(*inputs) + self.assertEqual(net.forward.get_traced_count(), trace_count) + self.assert_feed_mode(net.forward.inputs, mode) + + def assert_feed_mode(self, inputs, expect_mode): + assert isinstance(inputs, list) + assert isinstance(expect_mode, list) + in_names = [var.name for var in inputs] + + i, name_ids = 0, {} + + def to_idx(name): + nonlocal i + if name not in name_ids: + name_ids[name] = i + i += 1 + return name_ids[name] + + mode = [to_idx(name) for name in in_names] + self.assertEquals(mode, expect_mode) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py index 6150df5c29a9b..83d53cc22a205 100755 --- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -229,5 +229,7 @@ def set_strategy(self, strategy, name): "micro_batch_size": 2, "accumulate_steps": 4, } + elif name == 'asp': + strategy.asp = True else: raise NotImplementedError() diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt index 7692f8befdf58..c1a2c36d8a344 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt @@ -2,5 +2,8 @@ file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") foreach(target ${TEST_INTERP_CASES}) - py_test_modules(${target} MODULES ${target}) + py_test_modules(${target} MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0) + py_test_modules(${target}_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0.000001) + py_test_modules(${target}_fast_gc MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0) + py_test_modules(${target}_fast_gc_non_eager_deletion MODULES ${target} ENVS FLAGS_allocator_strategy=auto_growth FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0.000001) endforeach() diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 01b2cccfc48b2..48f95472c7ec7 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -32,18 +32,16 @@ def setUp(self): self.place.set_place(place) def build_program(self): - a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') - b = paddle.ones([2, 2]) * 2 - t = paddle.static.nn.fc(a, 2) - c = t + b - - main_program = paddle.fluid.default_main_program() - startup_program = paddle.fluid.default_startup_program() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') + b = paddle.ones([2, 2]) * 2 + t = paddle.static.nn.fc(a, 2) + c = t + b return startup_program, main_program, c - return standaloneexecutor, c - def test_interp_base(self): startup_program, main_program, c = self.build_program() standaloneexecutor = StandaloneExecutor( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 7c40d0e49f326..11abb2623bb22 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -7,6 +7,13 @@ string(REPLACE ".py" "" TEST_TRT_IR_PASSES 
"${TEST_TRT_IR_PASSES}") file(GLOB TEST_TRT_CONVERTER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_trt_convert_*.py") string(REPLACE ".py" "" TEST_TRT_CONVERTER "${TEST_TRT_CONVERTER}") +# Only for cpu(mkl + openblas) +set(TEST_INFERENCE_CPU_UT "test_mul_lstm_fuse_pass" "test_mul_gru_fuse_pass") + +foreach(CPU_UT ${TEST_INFERENCE_CPU_UT}) + list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES ${CPU_UT}) +endforeach() + foreach(TEST_INFERENCE_IR_PASS ${TEST_TRT_IR_PASSES}) list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES ${TEST_INFERENCE_IR_PASS}) endforeach() @@ -53,6 +60,15 @@ if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) endforeach() endif() +if (NOT WITH_MKLDNN AND NOT TENSORRT_FOUND AND NOT WITH_GPU) + foreach(target ${TEST_INFERENCE_CPU_UT}) + py_test_modules(${target} MODULES ${target}) + endforeach() + +set_tests_properties(test_mul_lstm_fuse_pass PROPERTIES TIMEOUT 300) +set_tests_properties(test_mul_gru_fuse_pass PROPERTIES TIMEOUT 300) +endif() + if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_trt_subgraph_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_activation_pass PROPERTIES TIMEOUT 120) @@ -83,17 +99,37 @@ if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) set_tests_properties(test_conv_act_mkldnn_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_conv_elementwise_add2_act_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_conv_elementwise_add_act_fuse_pass PROPERTIES TIMEOUT 120) - set_tests_properties(test_conv_elementwise_add_act_fuse_pass PROPERTIES TIMEOUT 90) - set_tests_properties(test_matmul_scale_fuse_pass PROPERTIES TIMEOUT 60) - set_tests_properties(test_matmul_v2_scale_fuse_pass PROPERTIES TIMEOUT 60) + set_tests_properties(test_flatten2_matmul_fuse_pass PROPERTIES TIMEOUT 240) + set_tests_properties(test_squeeze2_matmul_fuse_pass PROPERTIES TIMEOUT 240) + set_tests_properties(test_reshape2_matmul_fuse_pass PROPERTIES TIMEOUT 240) + if (WIN32) + set_tests_properties(test_matmul_scale_fuse_pass PROPERTIES TIMEOUT 300) + set_tests_properties(test_matmul_v2_scale_fuse_pass PROPERTIES TIMEOUT 300) + set_tests_properties(test_map_matmul_v2_to_matmul_pass PROPERTIES TIMEOUT 360) + set_tests_properties(test_map_matmul_v2_to_mul_pass PROPERTIES TIMEOUT 360) + set_tests_properties(test_map_matmul_to_mul_pass PROPERTIES TIMEOUT 360) + else () + set_tests_properties(test_matmul_scale_fuse_pass PROPERTIES TIMEOUT 60) + set_tests_properties(test_matmul_v2_scale_fuse_pass PROPERTIES TIMEOUT 60) + set_tests_properties(test_map_matmul_v2_to_matmul_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_map_matmul_v2_to_mul_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_map_matmul_to_mul_pass PROPERTIES TIMEOUT 120) + endif() endif() if (WITH_MKLDNN) + set_tests_properties(test_mkldnn_conv_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_mkldnn_depthwise_conv_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_mkldnn_reshape_transpose_matmul_fuse_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_mkldnn_prelu_op PROPERTIES TIMEOUT 300) set_tests_properties(test_conv_act_mkldnn_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_conv_transpose_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 250) + set_tests_properties(test_mkldnn_matmul_transpose_reshape_fuse_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_conv_transpose_bn_fuse_pass PROPERTIES TIMEOUT 300) + set_tests_properties(test_mkldnn_conv_hard_sigmoid_fuse_pass PROPERTIES TIMEOUT 300) + set_tests_properties(test_mkldnn_conv_hard_swish_fuse_pass 
PROPERTIES TIMEOUT 300) + set_tests_properties(test_mkldnn_batch_norm_act_fuse_pass PROPERTIES TIMEOUT 100) + set_tests_properties(test_mkldnn_matmul_v2_transpose_reshape_fuse_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_mkldnn_conv_transpose_bias_fuse_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_conv_eltwiseadd_bn_fuse_pass PROPERTIES TIMEOUT 300) endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index 08c634d58cafb..bb8c6e73fdefa 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -440,9 +440,7 @@ def run_test(self, quant=False, prog_configs=None): # baseline: no ir_optim run base_config = self.create_inference_config( - ir_optim=False, - use_gpu=pred_config.use_gpu(), - use_mkldnn=pred_config.mkldnn_enabled(), ) + ir_optim=False, use_gpu=pred_config.use_gpu()) try: # baseline base_result = self.run_test_config( @@ -699,8 +697,7 @@ def run_test(self, quant=False, *args, **kwargs): pred_config_deserialize, feed_data) except Exception as e: self.fail_log( - str(prog_config) + ' vs ' + self.inference_config_str( - pred_config) + + self.inference_config_str(pred_config) + '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e))) if not ignore_flag: status = False diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py new file mode 100644 index 0000000000000..6cd9ae970bb58 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_flatten2_matmul_fuse_pass.py @@ -0,0 +1,181 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestFlatten2MatmulFusePass(PassAutoScanTest): + """ + x_var + | + flatten2 + \ + flatten2_out_var y_var + \ / + matmul bias_var + \ / + elementwise_add + """ + + def sample_predictor_configs(self, program_config): + # TRT + # config = self.create_trt_inference_config() + # config.enable_tensorrt_engine( + # max_batch_size=10, + # workspace_size=102400, + # min_subgraph_size=0, + # precision_mode=paddle_infer.PrecisionType.Float32, + # use_static=False, + # use_calib_mode=False) + # yield config, ['mul', 'elementwise_add'], (1e-5, 1e-5) + + # cpu + config = self.create_inference_config(use_gpu=False) + yield config, ["mul", "elementwise_add"], (1e-5, 1e-5) + + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["mul", "elementwise_add"], (1e-5, 1e-5) + + def add_ignore_pass_case(self): + # Here we put some skip rules to avoid known bugs + def teller1(program_config, predictor_config): + if predictor_config.tensorrt_engine_enabled(): + # On 3080, the results of MatMul and Mul are different + # When the input Y is weight + return True + + # On TRT when the input Y is weight, Mul is converted to FC + if "matmul_y" not in program_config.weights \ + or "bias" not in program_config.weights: + return True + + y_shape = list(program_config.weights["matmul_y"].shape) + bias_shape = program_config.weights["bias"].shape + axis = program_config.ops[2].attrs["axis"] + # bias should be [mul_y_shape[-1]] + if axis == 0 or bias_shape[0] != y_shape[1] or len( + bias_shape) != 1: + return True + return False + + self.add_ignore_check_case( + teller1, + IgnoreReasons.PASS_ACCURACY_ERROR, + "The pass error on TRT while shape of bias is not [out_size].", ) + + def sample_program_config(self, draw): + # 1. Generate shape and attr of flatten2 + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=10), min_size=4, max_size=4)) + # [a, b, c, d] => [a, b*c*d] + flatten_axis = 1 + flatten_shape = [x_shape[0], x_shape[1] * x_shape[2] * x_shape[3]] + + # 2. Generate attr:transpose_X/transpose_Y/alpha of matmul + alpha = 1.0 + transpose_X = False + transpose_Y = False + + # 3. Generate legal shape of input:Y of matmul + y_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + y_shape[0] = flatten_shape[1] + + # 4. 
Generate legal attr:axis of elementwise_add + axis = draw(st.integers(min_value=-1, max_value=1)) + if axis == 0: + bias_shape = [flatten_shape[0], ] + elif axis == 1: + bias_shape = [y_shape[1]] + else: + bias_shape = [flatten_shape[0], y_shape[1]] + if draw(st.booleans()): + bias_shape[1] = 1 + + flatten2_op = OpConfig( + "flatten2", + inputs={"X": ["flatten2_x"], }, + axis=flatten_axis, + outputs={"Out": ["flatten2_out"], + "XShape": ["xshape"]}, ) + matmul_op = OpConfig( + "matmul", + inputs={"X": ["flatten2_out"], + "Y": ["matmul_y"]}, + outputs={"Out": ["matmul_out"]}, + alpha=alpha, + transpose_X=transpose_X, + transpose_Y=transpose_Y, + fused_reshape_X=[], + fused_reshape_Y=[], + fused_transpose_X=[], + fused_transpose_Y=[], + fused_reshape_Out=[], + fused_transpose_Out=[], ) + + add_op = OpConfig( + "elementwise_add", + inputs={"X": ["matmul_out"], + "Y": ["bias"]}, + outputs={"Out": ["add_out"]}, + axis=axis, ) + + ops = [flatten2_op, matmul_op, add_op] + + if draw(st.integers(min_value=1, max_value=10)) <= 8: + program_config = ProgramConfig( + ops=ops, + weights={ + "matmul_y": TensorConfig(shape=y_shape), + "bias": TensorConfig(shape=bias_shape), + }, + inputs={"flatten2_x": TensorConfig(shape=x_shape), }, + outputs=ops[-1].outputs["Out"], ) + else: + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "flatten2_x": TensorConfig(shape=x_shape), + "matmul_y": TensorConfig(shape=y_shape), + "bias": TensorConfig(shape=bias_shape), + }, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=50, + max_duration=1000, + passes=["flatten2_matmul_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py new file mode 100644 index 0000000000000..810603a4e4732 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_to_mul_pass.py @@ -0,0 +1,124 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
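The flatten2_matmul_fuse_pass exercised above relies on flatten2 with axis=1 followed by a 2-D matmul being a single fully-connected (mul-style) operation. A NumPy-only sketch of that equivalence, with hypothetical shapes chosen to mirror sample_program_config:

import numpy as np

# Hypothetical shapes: x is [a, b, c, d]; the weight width matches the flattened size b*c*d.
x = np.random.rand(2, 3, 4, 5).astype('float32')
w = np.random.rand(3 * 4 * 5, 7).astype('float32')
bias = np.random.rand(7).astype('float32')

# Path 1: flatten2(axis=1) -> matmul -> elementwise_add, as in the program above.
flattened = x.reshape(x.shape[0], -1)            # [a, b*c*d]
out_unfused = flattened @ w + bias

# Path 2: what the fused mul-style op effectively computes: collapse the
# non-batch dims of x and do one matrix multiply.
out_fused = x.reshape(x.shape[0], int(np.prod(x.shape[1:]))) @ w + bias

np.testing.assert_allclose(out_unfused, out_fused, rtol=1e-6, atol=1e-6)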
+ +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestMapMatmulToMulPass(PassAutoScanTest): + """ + x_var y_var(persistable) + \ / + matmul + """ + + def sample_predictor_configs(self, program_config): + # cpu + config = self.create_inference_config(use_gpu=False) + yield config, ["mul", ], (1e-5, 1e-5) + + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["mul", ], (1e-5, 1e-5) + + # TRT + # config = self.create_trt_inference_config() + # config.enable_tensorrt_engine( + # max_batch_size=10, + # workspace_size=10240, + # min_subgraph_size=0, + # precision_mode=paddle_infer.PrecisionType.Float32, + # use_static=False, + # use_calib_mode=False) + # yield config, ["mul", ], (1e-5, 1e-5) + + def add_ignore_pass_case(self): + # Here we put some skip rules to avoid known bugs + def teller1(program_config, predictor_config): + if predictor_config.use_gpu(): + # On 3080, the results of MatMul and Mul are different + return True + + if predictor_config.tensorrt_engine_enabled(): + # On 3080, the results of MatMul and Mul are different + return True + + x_shape = list(program_config.inputs["matmul_x"].shape) + if len(x_shape) > 5: + return True + return False + + self.add_ignore_check_case( + teller1, IgnoreReasons.PASS_ACCURACY_ERROR, + "The pass error on TRT while shape of mul_x > 5.") + + def sample_program_config(self, draw): + # 1. Generate shape and attr of matmul + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=5)) + y_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + y_shape[0] = x_shape[-1] + alpha = 1.0 + transpose_X = False + transpose_Y = False + + matmul_op = OpConfig( + "matmul", + inputs={"X": ["matmul_x"], + "Y": ["matmul_y"]}, + outputs={"Out": ["matmul_out"]}, + alpha=alpha, + transpose_X=transpose_X, + transpose_Y=transpose_Y, + fused_reshape_X=[], + fused_reshape_Y=[], + fused_transpose_X=[], + fused_transpose_Y=[], + fused_reshape_Out=[], + fused_transpose_Out=[], ) + + ops = [matmul_op, ] + weights = {"matmul_y": TensorConfig(shape=y_shape), } + inputs = {"matmul_x": TensorConfig(shape=x_shape), } + program_config = ProgramConfig( + ops=ops, + weights=weights, + inputs=inputs, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=100, + passes=["map_matmul_to_mul_pass"], + max_duration=180) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py new file mode 100644 index 0000000000000..915644f46e486 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_matmul_pass.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestMapMatmulToMulPass(PassAutoScanTest): + """ + x_var y_var(persistable) + \ / + matmul_v2 + """ + + def sample_predictor_configs(self, program_config): + # cpu + config = self.create_inference_config(use_gpu=False) + yield config, ["matmul", ], (1e-5, 1e-5) + + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["matmul", ], (1e-5, 1e-5) + + # TRT + # config = self.create_trt_inference_config() + # config.enable_tensorrt_engine( + # max_batch_size=10, + # workspace_size=10240, + # min_subgraph_size=0, + # precision_mode=paddle_infer.PrecisionType.Float32, + # use_static=False, + # use_calib_mode=False) + # yield config, ["matmul", ], (1e-5, 1e-5) + + def add_ignore_pass_case(self): + # Here we put some skip rules to avoid known bugs + def teller1(program_config, predictor_config): + if predictor_config.tensorrt_engine_enabled(): + # On 3080, the results of MatMul and Mul are different + return True + + x_shape = list(program_config.inputs["matmul_x"].shape) + if len(x_shape) > 5: + return True + return False + + self.add_ignore_check_case( + teller1, IgnoreReasons.PASS_ACCURACY_ERROR, + "The pass error on TRT while shape of mul_x > 5.") + + def sample_program_config(self, draw): + # 1. 
Generate shape and attr of matmul + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=5)) + y_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + transpose_X = draw(st.booleans()) + transpose_Y = draw(st.booleans()) + if transpose_X: + if transpose_Y: + y_shape[1] = x_shape[-2] + else: + y_shape[0] = x_shape[-2] + else: + if transpose_Y: + y_shape[1] = x_shape[-1] + else: + y_shape[0] = x_shape[-1] + + y_shape = x_shape[0:len(x_shape) - 2] + y_shape + alpha = 1.0 + + matmul_op = OpConfig( + "matmul_v2", + inputs={"X": ["matmul_x"], + "Y": ["matmul_y"]}, + outputs={"Out": ["matmul_out"]}, + alpha=alpha, + trans_x=transpose_X, + trans_y=transpose_Y, + fused_reshape_Out=[], + fused_transpose_Out=[], + fused_reshape_X=[], + fused_reshape_Y=[], + fused_transpose_X=[], + fused_transpose_Y=[], ) + + ops = [matmul_op, ] + weights = {} + inputs = { + "matmul_x": TensorConfig(shape=x_shape), + "matmul_y": TensorConfig(shape=y_shape), + } + + program_config = ProgramConfig( + ops=ops, + weights=weights, + inputs=inputs, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=100, + passes=["map_matmul_v2_to_matmul_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py new file mode 100644 index 0000000000000..cc2c1ab81bb2a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_map_matmul_v2_to_mul_pass.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
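The y_shape adjustments in sample_program_config above encode the matmul_v2 contraction rule: with trans_x the contracted dimension of X is x_shape[-2], otherwise x_shape[-1], and trans_y decides whether that must match Y's rows or columns. A NumPy-only sketch of that rule with hypothetical shapes:

import numpy as np

def matmul_v2_ref(x, y, trans_x=False, trans_y=False):
    # Reference semantics: optionally transpose the last two axes of each
    # operand, then do a batched matrix multiply.
    if trans_x:
        x = np.swapaxes(x, -1, -2)
    if trans_y:
        y = np.swapaxes(y, -1, -2)
    return np.matmul(x, y)

x = np.random.rand(4, 2, 8, 6).astype('float32')
# trans_x=True contracts x_shape[-2] == 8, so y's rows must be 8 when trans_y=False.
y = np.random.rand(4, 2, 8, 5).astype('float32')

out = matmul_v2_ref(x, y, trans_x=True, trans_y=False)
assert out.shape == (4, 2, 6, 5)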
+ +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestMapMatmulToMulPass(PassAutoScanTest): + """ + x_var y_var(persistable) + \ / + matmul_v2 + """ + + def sample_predictor_configs(self, program_config): + # cpu + config = self.create_inference_config(use_gpu=False) + yield config, ["mul", ], (1e-5, 1e-5) + + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["mul", ], (1e-5, 1e-5) + + # TRT + # config = self.create_trt_inference_config() + # config.enable_tensorrt_engine( + # max_batch_size=10, + # workspace_size=10240, + # min_subgraph_size=0, + # precision_mode=paddle_infer.PrecisionType.Float32, + # use_static=False, + # use_calib_mode=False) + # yield config, ["mul", ], (1e-5, 1e-5) + + def add_ignore_pass_case(self): + # Here we put some skip rules to avoid known bugs + def teller1(program_config, predictor_config): + if predictor_config.tensorrt_engine_enabled(): + # On 3080, the results of MatMul and Mul are different + return True + + x_shape = list(program_config.inputs["matmul_x"].shape) + if len(x_shape) > 5: + return True + return False + + self.add_ignore_check_case( + teller1, IgnoreReasons.PASS_ACCURACY_ERROR, + "The pass error on TRT while shape of mul_x > 5.") + + def sample_program_config(self, draw): + # 1. Generate shape and attr of matmul + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=5)) + y_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + y_shape[0] = x_shape[-1] + alpha = 1.0 + transpose_X = False + transpose_Y = False + + matmul_op = OpConfig( + "matmul_v2", + inputs={"X": ["matmul_x"], + "Y": ["matmul_y"]}, + outputs={"Out": ["matmul_out"]}, + alpha=alpha, + trans_x=transpose_X, + trans_y=transpose_Y, + fused_reshape_Out=[], + fused_transpose_Out=[], + fused_reshape_X=[], + fused_reshape_Y=[], + fused_transpose_X=[], + fused_transpose_Y=[], ) + + ops = [matmul_op, ] + weights = {"matmul_y": TensorConfig(shape=y_shape), } + inputs = {"matmul_x": TensorConfig(shape=x_shape), } + program_config = ProgramConfig( + ops=ops, + weights=weights, + inputs=inputs, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, max_examples=100, + passes=["map_matmul_v2_to_mul_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py index c119cbec884e2..0012ebb05b162 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -11,68 +11,110 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Test for fusion of batch norm and activation.""" -from __future__ import print_function -import unittest +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestScaleMatmulMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + data_layout = draw(st.sampled_from(["NCHW", "NHWC"])) + epsilon = draw(st.floats(min_value=0.0, max_value=0.001)) + fuse_with_relu = draw(st.booleans()) + is_test = draw(st.sampled_from([True])) + momentum = draw(st.floats(min_value=0.0, max_value=5)) + trainable_statistics = False + use_global_stats = draw(st.booleans()) + use_mkldnn1 = draw(st.sampled_from([True])) + use_cudnn = draw(st.booleans()) + use_mkldnn2 = draw(st.sampled_from([True])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + channel = draw(st.integers(min_value=1, max_value=64)) + input_dim1 = draw(st.integers(min_value=1, max_value=512)) + input_dim2 = draw(st.integers(min_value=1, max_value=512)) + + def generate_input(): + shape = [input_dim1, input_dim2] + if data_layout == "NCHW": + shape.insert(0, channel) + shape.insert(0, batch_size) + else: + shape.append(channel) + shape.insert(0, batch_size) + return np.random.random(shape).astype(np.float32) + + def generate_weight(): + return np.random.random(channel).astype(np.float32) + + batch_norm_op = OpConfig( + type="batch_norm", + inputs={ + "X": ["input_data"], + "Bias": ["Bias"], + "Mean": ["Mean"], + "Scale": ["Scale"], + "Variance": ["Variance"] + }, + outputs={ + "Y": ["norm_output"], + "MeanOut": ["Mean"], + "VarianceOut": ["Variance"], + "SavedMean": ["SavedMean"], + "SavedVariance": ["SavedVariance"] + }, + attrs={ + "data_layout": data_layout, + "epsilon": epsilon, + "fuse_with_relu": fuse_with_relu, + "is_test": is_test, + "momentum": momentum, + "trainable_statistics": trainable_statistics, + "use_global_stats": use_global_stats, + "use_mkldnn": use_mkldnn1 + }) + + relu_op = OpConfig( + type="relu", + inputs={"X": ["norm_output"]}, + outputs={"Out": ["relu_output"]}, + attrs={"use_cudnn": use_cudnn, + "use_mkldnn": use_mkldnn2}) + + model_net = [batch_norm_op, relu_op] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "Bias": TensorConfig(data_gen=partial(generate_weight)), + "Mean": TensorConfig(data_gen=partial(generate_weight)), + "Scale": TensorConfig(data_gen=partial(generate_weight)), + "Variance": TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["relu_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["batch_norm"], (1e-5, 1e-5) -import paddle.fluid as fluid -from inference_pass_test import InferencePassTest -from paddle import enable_static -from paddle.fluid.core import PassVersionChecker - -enable_static() - - -class 
BnReluOneDnnFusePassTest(InferencePassTest): - def setUp(self): - self.set_params() - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 3, 100, 100], dtype="float32") - bn_out = fluid.layers.batch_norm( - input=data, is_test=True, use_global_stats=self.global_stats) - relu_out = fluid.layers.relu(bn_out) - - self.feeds = { - "data": np.random.random((1, 3, 100, 100)).astype("float32") - } - self.fetch_list = [relu_out] - self.enable_mkldnn = True - - def set_params(self): - self.global_stats = False - self.pass_name = "batch_norm_act_fuse_pass" - - def test_check_output(self): - self.check_output() - self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) - - -class BnReluGlobalStatsOneDnnFusePassTest(InferencePassTest): - def setUp(self): - self.set_params() - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 3, 100, 100], dtype="float32") - bn_out = fluid.layers.batch_norm( - input=data, is_test=True, use_global_stats=self.global_stats) - relu_out = fluid.layers.relu(bn_out) - - self.feeds = { - "data": np.random.random((1, 3, 100, 100)).astype("float32") - } - self.fetch_list = [relu_out] - self.enable_mkldnn = True - - def set_params(self): - self.global_stats = True - self.pass_name = "batch_norm_act_fuse_pass" - - def test_check_output(self): - self.check_output() - self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + def test(self): + self.run_and_statis(quant=False, passes=["batch_norm_act_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py new file mode 100644 index 0000000000000..66c547de2c280 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py @@ -0,0 +1,160 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + # If the problem has been fixed, the judgment + # needs to be deleted!!! 
+ if attrs[1]['data_format'] == "NHWC": + return False + + return True + + def sample_program_config(self, draw): + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + dilations = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.sampled_from([1, 2, 4])) + paddings = draw(st.sampled_from([[0, 3], [1, 1], [1, 2, 3, 4]])) + strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + axis = draw(st.sampled_from([-1, 0, 1])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input1(): + if data_format == "NCHW": + return np.random.random( + [batch_size, 48, 64, 64]).astype(np.float32) + else: + return np.random.random( + [batch_size, 64, 64, 48]).astype(np.float32) + + def generate_weight1(): + return np.random.random( + [48, int(48 / groups), 3, 3]).astype(np.float32) + + def compute_out_shape(padding_alg): + import paddle + import paddle.nn as nn + + x_var = paddle.uniform( + (batch_size, 48, 64, 64), dtype='float32', min=-1., max=1.) + if padding_alg == "EXPLICIT": + conv = nn.Conv2D(48, 48, (3, 3), strides, paddings, dilations, + 1) + else: + conv = nn.Conv2D(48, 48, (3, 3), strides, padding_alg, + dilations, 1) + y_var = conv(x_var) + return y_var.shape + + def generate_weight2(): + return np.random.random([48]).astype(np.float32) + + if compute_out_shape(padding_algorithm) != (batch_size, 48, 64, 64): + axis = 1 + + relu_op = OpConfig( + type="relu", + inputs={"X": ["input_data1"]}, + outputs={"Out": ["sigmoid_out"]}, + attrs={}) + + conv2d_op = OpConfig( + type="conv2d", + inputs={"Input": ["sigmoid_out"], + "Filter": ["conv_weight"]}, + outputs={"Output": ["conv_output"]}, + attrs={ + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + }) + + if axis == -1 or axis == 0: + elt_op = OpConfig( + type="elementwise_add", + inputs={"X": ["input_data1"], + "Y": ["conv_output"]}, + outputs={"Out": ["elementwise_output"]}, + attrs={'axis': axis}) + else: + elt_op = OpConfig( + type="elementwise_add", + inputs={"X": ["conv_output"], + "Y": ["elementwise_weight"]}, + outputs={"Out": ["elementwise_output"]}, + attrs={'axis': axis}) + + model_net = [relu_op, conv2d_op, elt_op] + + if axis == 1: + program_config = ProgramConfig( + ops=model_net, + weights={ + "conv_weight": + TensorConfig(data_gen=partial(generate_weight1)), + "elementwise_weight": + TensorConfig(data_gen=partial(generate_weight2)) + }, + inputs={ + "input_data1": + TensorConfig(data_gen=partial(generate_input1)) + }, + outputs=["elementwise_output"]) + else: + program_config = ProgramConfig( + ops=model_net, + weights={ + "conv_weight": + TensorConfig(data_gen=partial(generate_weight1)) + }, + inputs={ + "input_data1": + TensorConfig(data_gen=partial(generate_input1)) + }, + outputs=["elementwise_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["relu", "conv2d"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py new file mode 100644 index 0000000000000..a0c4e183930a5 --- 
/dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestConvHardSigmoidMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + # If the problem has been fixed, the judgment + # needs to be deleted!!! + if attrs[0]['data_format'] == "NHWC": + return False + + return True + + def sample_program_config(self, draw): + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + dilations = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.sampled_from([1, 2, 4])) + paddings = draw(st.sampled_from([[0, 3], [1, 2, 3, 4]])) + strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + slope = draw(st.floats(min_value=0, max_value=10)) + offset = draw(st.floats(min_value=0, max_value=10)) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + if data_format == "NCHW": + return np.random.random( + [batch_size, 48, 64, 64]).astype(np.float32) + else: + return np.random.random( + [batch_size, 64, 64, 48]).astype(np.float32) + + def generate_weight(): + return np.random.random( + [16, int(48 / groups), 3, 3]).astype(np.float32) + + ops_config = [{ + "op_type": "conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["input_weight"] + }, + "op_outputs": { + "Output": ["conv_output"] + }, + "op_attrs": { + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + } + }, { + "op_type": "hard_sigmoid", + "op_inputs": { + "X": ["conv_output"] + }, + "op_outputs": { + "Out": ["sigmoid_output"] + }, + "op_attrs": { + "slope": slope, + "offset": offset + }, + }] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "input_weight": TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)), + }, + outputs=["sigmoid_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["conv2d"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["conv_hard_sigmoid_mkldnn_fuse_pass"]) + + +if __name__ == "__main__": + 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py new file mode 100644 index 0000000000000..17bfb625fd37b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py @@ -0,0 +1,121 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestConvHardSwishMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + # If the problem has been fixed, the judgment + # needs to be deleted!!! + if attrs[0]['data_format'] == "NHWC": + return False + + return True + + def sample_program_config(self, draw): + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + dilations = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.sampled_from([1, 2, 4])) + paddings = draw(st.sampled_from([[0, 3], [1, 2, 3, 4]])) + strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + threshold = draw(st.sampled_from([6.0])) + scale = draw(st.sampled_from([6.0])) + offset = draw(st.sampled_from([3.0])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + if data_format == "NCHW": + return np.random.random( + [batch_size, 48, 64, 64]).astype(np.float32) + else: + return np.random.random( + [batch_size, 64, 64, 48]).astype(np.float32) + + def generate_weight(): + return np.random.random( + [16, int(48 / groups), 3, 3]).astype(np.float32) + + ops_config = [{ + "op_type": "conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["input_weight"] + }, + "op_outputs": { + "Output": ["conv_output"] + }, + "op_attrs": { + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + } + }, { + "op_type": "hard_swish", + "op_inputs": { + "X": ["conv_output"] + }, + "op_outputs": { + "Out": ["swish_output"] + }, + "op_attrs": { + "threshold": threshold, + "scale": scale, + "offset": offset + }, + }] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "input_weight": TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)), + }, + outputs=["swish_output"]) + + return 
program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["conv2d"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["conv_hard_swish_mkldnn_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py index a6b5e0e54739b..c0d3ff766b8da 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_transpose_reshape_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,69 +12,118 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -import unittest +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig import numpy as np -from inference_pass_test import InferencePassTest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.core import AnalysisConfig -from paddle.fluid.core import PassVersionChecker - - -class MatmulTransposeReshapeMkldnnFusePassTest(InferencePassTest): - def setUp(self): - self.set_params() - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=self.data_shape, dtype="float32") - weight = fluid.layers.create_parameter( - shape=self.weight_shape, dtype="float32") - matmul = fluid.layers.matmul( - data, - weight, - transpose_x=self.transpose_x, - transpose_y=self.transpose_y) - transpose = fluid.layers.transpose(matmul, self.tranpose_perm) - reshape = fluid.layers.reshape(transpose, shape=self.reshape_shape) - - self.fetch_list = [reshape] - self.enable_mkldnn = True - - def set_params(self): - self.data_shape = [-1, 3, 100, 110] - self.weight_shape = [1, 3, 110, 100] - self.feeds = { - "data": np.random.random((1, 3, 100, 110)).astype("float32") - } - self.transpose_x = False - self.transpose_y = False - self.tranpose_perm = [0, 2, 1, 3] - self.reshape_shape = [3, 100, 100] - self.pass_name = 'matmul_transpose_reshape_fuse_pass' - - def test_check_output(self): - use_gpu = False - self.check_output_with_option(use_gpu) - - def test_pass_compatible(self): - self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) - - -class MatmulTransposeReshapeMkldnnFusePassTest_1( - MatmulTransposeReshapeMkldnnFusePassTest): - def set_params(self): - self.data_shape = [-1, 3, 100, 100] - self.weight_shape = [1, 3, 100, 100] - self.feeds = { - "data": np.random.random((1, 3, 100, 100)).astype("float32") - } - self.transpose_x = True - self.transpose_y = True - self.tranpose_perm = [0, 2, 1, 3] - self.reshape_shape = [6, 50, 100] - self.pass_name = 'matmul_transpose_reshape_fuse_pass' +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st 
+ + +class TestMatmulTransposeReshapeMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + # If the problem has been fixed, the judgment + # needs to be deleted!!! + if 0 in attrs[2]['shape']: + return False + + return True + + def sample_program_config(self, draw): + transpose_X = draw(st.booleans()) + transpose_Y = draw(st.booleans()) + alpha = draw(st.floats(min_value=0.01, max_value=2)) + axis = draw(st.sampled_from([[0, 2, 1, 3]])) + shape = draw(st.sampled_from([[0, -1, 128], [-1, 1, 64]])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + channel = draw(st.integers(min_value=1, max_value=64)) + input_dim = draw(st.sampled_from([32, 64])) + + def generate_input(type): + if transpose_X and transpose_Y: + shape_x = [batch_size, channel, input_dim, 32] + shape_y = [batch_size, channel, 64, input_dim] + elif transpose_X: + shape_x = [batch_size, channel, input_dim, 32] + shape_y = [batch_size, channel, input_dim, 64] + elif transpose_Y: + shape_x = [batch_size, channel, 32, input_dim] + shape_y = [batch_size, channel, 8, input_dim] + else: + shape_x = [batch_size, channel, 32, input_dim] + shape_y = [batch_size, channel, input_dim, 16] + + if type == "x": + return np.random.random(shape_x).astype(np.float32) + else: + return np.random.random(shape_y).astype(np.float32) + + matmul_op = OpConfig( + type="matmul", + inputs={"X": ["input_data1"], + "Y": ["input_data2"]}, + outputs={"Out": ["matmul_output"]}, + attrs={ + "transpose_X": transpose_X, + "transpose_Y": transpose_Y, + "alpha": alpha, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [] + }) + + transpose2_op = OpConfig( + type="transpose2", + inputs={"X": ["matmul_output"]}, + outputs={ + "Out": ["transpose2_output"], + "XShape": ["transpose2_xshape"] + }, + attrs={'axis': axis}) + + reshape2_op = OpConfig( + type="reshape2", + inputs={"X": ["transpose2_output"]}, + outputs={ + "Out": ["reshape2_output"], + "XShape": ["reshape2_xshape"] + }, + attrs={'shape': shape}) + + model_net = [matmul_op, transpose2_op, reshape2_op] + + program_config = ProgramConfig( + ops=model_net, + weights={}, + inputs={ + "input_data1": + TensorConfig(data_gen=partial(generate_input, "x")), + "input_data2": + TensorConfig(data_gen=partial(generate_input, "y")) + }, + outputs=["reshape2_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["matmul"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["matmul_transpose_reshape_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py index 698e399c71ccd..ffdc84b8bd9ff 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py @@ -12,71 +12,142 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function - -import unittest +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig import numpy as np -from inference_pass_test import InferencePassTest -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.core import AnalysisConfig -from paddle.fluid.core import PassVersionChecker - - -class TestMatmulV2OneDNNTransposeReshapeFusePass(InferencePassTest): - def setUp(self): - self.set_params() - self.tranpose_perm = [0, 2, 1, 3] - self.pass_name = 'matmul_v2_transpose_reshape_fuse_pass' - - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=self.data_shape, dtype="float32") - weight = fluid.layers.create_parameter( - shape=self.weight_shape, dtype="float32") - matmul = paddle.matmul( - data, - weight, - transpose_x=self.transpose_x, - transpose_y=self.transpose_y) - transpose = fluid.layers.transpose(matmul, self.tranpose_perm) - reshape = fluid.layers.reshape(transpose, shape=self.reshape_shape) - - self.fetch_list = [reshape] - self.enable_mkldnn = True - - def set_params(self): - self.data_shape = [-1, 3, 100, 110] - self.weight_shape = [1, 3, 110, 100] - self.feeds = { - "data": np.random.random((1, 3, 100, 110)).astype("float32") - } - self.transpose_x = False - self.transpose_y = False - self.reshape_shape = [3, 100, 100] - - def test_check_output(self): - use_gpu = False - self.check_output_with_option(use_gpu) - - def test_pass_compatible(self): - self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) - - -class TestMatmulV2OneDNNTransposeReshapeFusePassDifferentDims( - TestMatmulV2OneDNNTransposeReshapeFusePass): - def set_params(self): - self.data_shape = [-1, 4, 100, 80] - self.weight_shape = [1, 4, 80, 100] - self.feeds = { - "data": np.random.random((1, 4, 100, 80)).astype("float32") - } - self.transpose_x = True - self.transpose_y = True - self.reshape_shape = [8, 40, 80] +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestMatmulv2TransposeReshapeMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + if program_config.inputs["input_data1"].shape[ + -4] != 1 and program_config.inputs["input_data2"].shape[ + -4] != 1: + if program_config.inputs["input_data1"].shape[ + -4] != program_config.inputs["input_data2"].shape[-4]: + return False + + if program_config.inputs["input_data1"].shape[ + -3] != 1 and program_config.inputs["input_data2"].shape[ + -3] != 1: + if program_config.inputs["input_data1"].shape[ + -3] != program_config.inputs["input_data2"].shape[-3]: + return False + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + # If the problem has been fixed, the judgment + # needs to be deleted!!! 
+ if 0 in attrs[2]['shape']: + return False + + return True + + def sample_program_config(self, draw): + transpose_X = draw(st.booleans()) + transpose_Y = draw(st.booleans()) + axis = draw(st.sampled_from([[0, 2, 1, 3]])) + shape = draw(st.sampled_from([[0, -1, 128], [-1, 1, 64], [1, -1, 32]])) + batch_size1 = draw(st.integers(min_value=1, max_value=4)) + batch_size2 = draw(st.integers(min_value=1, max_value=4)) + channel1 = draw(st.sampled_from([1, 16, 32, 64])) + channel2 = draw(st.sampled_from([1, 16, 32, 64])) + input_dim = draw(st.sampled_from([16, 32, 64])) + + def generate_input(type): + if transpose_X and transpose_Y: + shape_x = [batch_size1, channel1, input_dim, 32] + shape_y = [batch_size2, channel2, 64, input_dim] + elif transpose_X: + shape_x = [batch_size1, channel1, input_dim, 32] + shape_y = [batch_size2, channel2, input_dim, 64] + elif transpose_Y: + shape_x = [batch_size1, channel1, 32, input_dim] + shape_y = [batch_size2, channel2, 8, input_dim] + else: + shape_x = [batch_size1, channel1, 32, input_dim] + shape_y = [batch_size2, channel2, input_dim, 16] + + if type == "x": + return np.random.random(shape_x).astype(np.float32) + else: + return np.random.random(shape_y).astype(np.float32) + + matmul_op = OpConfig( + type="matmul_v2", + inputs={"X": ["input_data1"], + "Y": ["input_data2"]}, + outputs={"Out": ["matmul_output"]}, + attrs={ + "trans_x": transpose_X, + "trans_y": transpose_Y, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [] + }) + + transpose2_op = OpConfig( + type="transpose2", + inputs={"X": ["matmul_output"]}, + outputs={ + "Out": ["transpose2_output"], + "XShape": ["transpose2_xshape"] + }, + attrs={'axis': axis}) + + reshape2_op = OpConfig( + type="reshape2", + inputs={"X": ["transpose2_output"]}, + outputs={ + "Out": ["reshape2_output"], + "XShape": ["reshape2_xshape"] + }, + attrs={'shape': shape}) + + model_net = [matmul_op, transpose2_op, reshape2_op] + + program_config = ProgramConfig( + ops=model_net, + weights={}, + inputs={ + "input_data1": + TensorConfig(data_gen=partial(generate_input, "x")), + "input_data2": + TensorConfig(data_gen=partial(generate_input, "y")) + }, + outputs=["reshape2_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + # map_matmul_v2_to_matmul_pass will affect the type of final fused op + fused_op = "matmul_v2" + input1_dim1 = program_config.inputs["input_data1"].shape[0] + input2_dim1 = program_config.inputs["input_data2"].shape[0] + input1_dim2 = program_config.inputs["input_data1"].shape[1] + input2_dim2 = program_config.inputs["input_data2"].shape[1] + if input1_dim1 == input2_dim1 and input1_dim2 == input2_dim2: + fused_op = "matmul" + + config = self.create_inference_config(use_mkldnn=True) + yield config, [fused_op], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["matmul_v2_transpose_reshape_fuse_pass"]) if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py new file mode 100644 index 0000000000000..9fa98045ef303 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmulv2_op.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import MkldnnAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestMkldnnMatmulv2Op(MkldnnAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + if len(program_config.inputs["input_data2"].shape) == 4: + if program_config.inputs["input_data1"].shape[ + -4] != 1 and program_config.inputs["input_data2"].shape[ + -4] != 1: + if program_config.inputs["input_data1"].shape[ + -4] != program_config.inputs["input_data2"].shape[-4]: + return False + + if program_config.inputs["input_data1"].shape[ + -3] != 1 and program_config.inputs["input_data2"].shape[ + -3] != 1: + if program_config.inputs["input_data1"].shape[ + -3] != program_config.inputs["input_data2"].shape[-3]: + return False + return True + + def sample_program_configs(self, *args, **kwargs): + def generate_input(type, *args, **kwargs): + transpose_X = kwargs["transpose_X"] + transpose_Y = kwargs["transpose_Y"] + batch_size1 = kwargs["batch_size1"] + batch_size2 = kwargs["batch_size2"] + channel1 = kwargs["channel1"] + channel2 = kwargs["channel2"] + input_dim = kwargs["input_dim"] + y_dim_len = kwargs["y_dim_len"] + if transpose_X and transpose_Y: + shape_x = [batch_size1, channel1, input_dim, 32] + if y_dim_len == 4: + shape_y = [batch_size2, channel2, 64, input_dim] + elif y_dim_len == 3: + shape_y = [channel2, 64, input_dim] + elif transpose_X: + shape_x = [batch_size1, channel1, input_dim, 32] + if y_dim_len == 4: + shape_y = [batch_size2, channel2, input_dim, 64] + elif y_dim_len == 3: + shape_y = [channel2, input_dim, 64] + elif transpose_Y: + shape_x = [batch_size1, channel1, 32, input_dim] + if y_dim_len == 4: + shape_y = [batch_size2, channel2, 8, input_dim] + elif y_dim_len == 3: + shape_y = [channel2, 8, input_dim] + else: + shape_x = [batch_size1, channel1, 32, input_dim] + if y_dim_len == 4: + shape_y = [batch_size2, channel2, input_dim, 16] + elif y_dim_len == 3: + shape_y = [channel2, input_dim, 16] + + if type == "x": + return np.random.random(shape_x).astype(np.float32) + else: + return np.random.random(shape_y).astype(np.float32) + + matmul_op = OpConfig( + type="matmul_v2", + inputs={"X": ["input_data1"], + "Y": ["input_data2"]}, + outputs={"Out": ["matmul_output"]}, + attrs={ + "trans_x": kwargs["transpose_X"], + "trans_y": kwargs["transpose_Y"], + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [] + }) + + program_config = ProgramConfig( + ops=[matmul_op], + weights={}, + inputs={ + "input_data1": TensorConfig(data_gen=partial( + generate_input, "x", *args, **kwargs)), + 
"input_data2": TensorConfig(data_gen=partial( + generate_input, "y", *args, **kwargs)) + }, + outputs=["matmul_output"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, (1e-5, 1e-5) + + @given( + transpose_X=st.booleans(), + transpose_Y=st.booleans(), + y_dim_len=st.sampled_from([3, 4]), + batch_size1=st.integers( + min_value=1, max_value=4), + batch_size2=st.integers( + min_value=1, max_value=4), + channel1=st.sampled_from([1, 16, 32, 64]), + channel2=st.sampled_from([1, 16, 32, 64]), + input_dim=st.sampled_from([16, 32, 64])) + def test(self, *args, **kwargs): + self.run_test(*args, **kwargs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py new file mode 100644 index 0000000000000..952cd27bbaeab --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_fuse_pass.py @@ -0,0 +1,170 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st +from functools import reduce + +num = 32 * 64 + + +class TestReshapeTransposeMatmulMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + transpose_X = draw(st.booleans()) + transpose_Y = draw(st.booleans()) + alpha = draw(st.floats(min_value=0.01, max_value=2)) + axis = draw(st.sampled_from([[0, 2, 1, 3]])) + shape = draw( + st.sampled_from([[0, 64, -1, 32], [0, 32, -1, 64], [-1, 32, 1, 64] + ])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + channel = draw(st.integers(min_value=1, max_value=64)) + input_dim = draw(st.sampled_from([32, 64])) + + def generate_input1(attrs): + shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], num] + return np.random.random(shape_x).astype(np.float32) + + def generate_input2(attrs): + shape_x = [attrs[3]['batch_size'], attrs[3]['channel'], num] + input_volume = reduce(lambda x, y: x * y, shape_x) + matmul_shape = [i for i in attrs[0]['shape']] + if 0 in matmul_shape: + for i in range(len(matmul_shape)): + if matmul_shape[i] == 0: + matmul_shape[i] = shape_x[i] + shape_volume = reduce(lambda x, y: x * y, matmul_shape) + + if -1 in matmul_shape: + for i in range(len(matmul_shape)): + if matmul_shape[i] == -1: + matmul_shape[i] = int(abs(input_volume / shape_volume)) + + # Only for transpose axis [0, 2, 1, 3] + matmul_shape[1], matmul_shape[2] = matmul_shape[2], matmul_shape[1] + + if attrs[2]['transpose_X'] and attrs[2]['transpose_Y']: + shape_y = [ + matmul_shape[0], matmul_shape[1], matmul_shape[-1], + int(num / matmul_shape[-1]) + ] + elif attrs[2]['transpose_X']: + shape_y = matmul_shape + elif attrs[2]['transpose_Y']: + shape_y = matmul_shape + else: + shape_y = [ + matmul_shape[0], matmul_shape[1], matmul_shape[-1], + int(num / matmul_shape[-1]) + ] + return np.random.random(shape_y).astype(np.float32) + + attrs = [{ + "shape": shape + }, { + "axis": axis + }, { + "transpose_X": transpose_X, + "transpose_Y": transpose_Y, + "alpha": alpha + }, { + 'batch_size': batch_size, + 'channel': channel, + 'input_dim': input_dim + }] + + ops_config = [{ + "op_type": "reshape2", + "op_inputs": { + "X": ["input_data1"] + }, + "op_outputs": { + "Out": ["reshape2_output"], + "XShape": ["reshape2_xshape"] + }, + "op_attrs": { + 'shape': attrs[0]['shape'] + }, + }, { + "op_type": "transpose2", + "op_inputs": { + "X": ["reshape2_output"] + }, + "op_outputs": { + "Out": ["transpose2_output"], + "XShape": ["transpose2_xshape"] + }, + "op_attrs": { + 'axis': attrs[1]['axis'] + }, + }, { + "op_type": "matmul", + "op_inputs": { + "X": ["transpose2_output"], + "Y": ["input_data2"] + }, + "op_outputs": { + "Out": ["matmul_output"] + }, + "op_attrs": { + 'transpose_X': attrs[2]['transpose_X'], + 'transpose_Y': attrs[2]['transpose_Y'], + 'alpha': attrs[2]['alpha'], + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [] + } + }] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + 
"input_data1": + TensorConfig(data_gen=partial(generate_input1, attrs)), + "input_data2": + TensorConfig(data_gen=partial(generate_input2, attrs)) + }, + outputs=["matmul_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["matmul"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["reshape_transpose_matmul_mkldnn_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py new file mode 100644 index 0000000000000..9b1400e45bbc0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_gru_fuse_pass.py @@ -0,0 +1,139 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st +from functools import reduce + + +class TestMulGruFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + x_col = draw(st.sampled_from([1])) + y_col = draw(st.sampled_from([1])) + activation = draw(st.sampled_from(['sigmoid', 'tanh'])) + is_reverse = draw(st.booleans()) + has_origin_mode = draw(st.booleans()) + origin_mode = False + gate_activation = draw(st.sampled_from(['sigmoid', 'tanh'])) + batch_size = draw(st.integers(min_value=1, max_value=40)) + + def generate_input(): + shape = [batch_size, 128, 6, 120] + return np.full(shape, 0.001).astype(np.float32) + + def generate_weight(shape): + return np.full(shape, 0.0001).astype(np.float32) + + im2sequence_op = OpConfig( + type="im2sequence", + inputs={"X": ["input_data"]}, + outputs={"Out": ["seq_out"]}, + attrs={ + "kernels": [6, 1], + "out_stride": [1, 1], + "paddings": [0, 0, 0, 0], + "strides": [1, 1] + }) + + mul_op = OpConfig( + type="mul", + inputs={"X": ["seq_out"], + "Y": ["mul_weight"]}, + outputs={"Out": ["mul_out"]}, + attrs={"x_num_col_dims": x_col, + "y_num_col_dims": y_col}) + + if has_origin_mode: + gru_op = OpConfig( + type="gru", + inputs={ + "Input": ["mul_out"], + "Weight": ["gru_weight"], + "Bias": ["gru_bias"] + }, + outputs={ + "BatchGate": ["batch_gate"], + "BatchHidden": ["batch_hidden"], + "BatchResetHiddenPrev": ["batch_reset"], + "Hidden": ["hidden"] + }, + attrs={ + 'activation': activation, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'is_test': True, + 'origin_mode': origin_mode + }) + else: + gru_op = OpConfig( + type="gru", + 
inputs={ + "Input": ["mul_out"], + "Weight": ["gru_weight"], + "Bias": ["gru_bias"] + }, + outputs={ + "BatchGate": ["batch_gate"], + "BatchHidden": ["batch_hidden"], + "BatchResetHiddenPrev": ["batch_reset"], + "Hidden": ["hidden"] + }, + attrs={ + 'activation': activation, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'is_test': True + }) + + model_net = [im2sequence_op, mul_op, gru_op] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "mul_weight": + TensorConfig(data_gen=partial(generate_weight, [768, 600])), + "gru_weight": + TensorConfig(data_gen=partial(generate_weight, [200, 600])), + "gru_bias": + TensorConfig(data_gen=partial(generate_weight, [1, 600])) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["hidden"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config() + yield config, ["im2sequence", "fusion_gru"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis(quant=False, passes=["mul_gru_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mul_lstm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_lstm_fuse_pass.py new file mode 100644 index 0000000000000..c944abb60c86a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mul_lstm_fuse_pass.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st +from functools import reduce + + +class TestMulLstmFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + x_col = draw(st.sampled_from([1])) + y_col = draw(st.sampled_from([1])) + use_peepholes = draw(st.booleans()) + is_reverse = draw(st.booleans()) + gate_activation = draw(st.sampled_from(["sigmoid"])) + cell_activation = draw(st.sampled_from(["tanh", "relu", "identity"])) + candidate_activation = draw( + st.sampled_from(["tanh", "relu", "identity"])) + batch_size = draw(st.integers(min_value=1, max_value=40)) + + def generate_input(): + shape = [batch_size, 128, 6, 120] + return np.full(shape, 0.01).astype(np.float32) + + def generate_weight(shape): + return np.full(shape, 0.0001).astype(np.float32) + + im2sequence_op = OpConfig( + type="im2sequence", + inputs={"X": ["input_data"]}, + outputs={"Out": ["seq_out"]}, + attrs={ + "kernels": [6, 1], + "out_stride": [1, 1], + "paddings": [0, 0, 0, 0], + "strides": [1, 1] + }) + + mul_op = OpConfig( + type="mul", + inputs={"X": ["seq_out"], + "Y": ["mul_weight"]}, + outputs={"Out": ["mul_out"]}, + attrs={"x_num_col_dims": x_col, + "y_num_col_dims": y_col}) + + lstm_op = OpConfig( + type="lstm", + inputs={ + "Input": ["mul_out"], + "Weight": ["lstm_weight"], + "Bias": ["lstm_bias"] + }, + outputs={ + "Hidden": ["lstm_hidden"], + "Cell": ["lstm_cell"], + "BatchGate": ["lstm_gate"], + "BatchCellPreAct": ["lstm_batch_cell"] + }, + attrs={ + 'use_peepholes': use_peepholes, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'cell_activation': cell_activation, + 'candidate_activation': candidate_activation, + 'is_test': True + }) + + model_net = [im2sequence_op, mul_op, lstm_op] + + if use_peepholes: + lstm_bias_shape = [1, 1050] + else: + lstm_bias_shape = [1, 600] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "mul_weight": + TensorConfig(data_gen=partial(generate_weight, [768, 600])), + "lstm_weight": + TensorConfig(data_gen=partial(generate_weight, [150, 600])), + "lstm_bias": + TensorConfig(data_gen=partial(generate_weight, lstm_bias_shape)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)), + }, + outputs=["lstm_hidden"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config() + yield config, ["im2sequence", "fusion_lstm"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, max_duration=300, passes=["mul_lstm_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py new file mode 100644 index 0000000000000..951ec8e4e8ef4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_reshape2_matmul_fuse_pass.py @@ -0,0 +1,179 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestReshape2MatmulFusePass(PassAutoScanTest): + """ + x_var + | + reshape2 + \ + reshape2_out_var y_var + \ / + matmul bias_var + \ / + elementwise_add + """ + + def sample_predictor_configs(self, program_config): + # TRT + # config = self.create_trt_inference_config() + # config.enable_tensorrt_engine( + # max_batch_size=10, + # workspace_size=102400, + # min_subgraph_size=0, + # precision_mode=paddle_infer.PrecisionType.Float32, + # use_static=False, + # use_calib_mode=False) + # yield config, ['mul', 'elementwise_add'], (1e-5, 1e-5) + + # cpu + config = self.create_inference_config(use_gpu=False) + yield config, ["mul", "elementwise_add"], (1e-5, 1e-5) + + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["mul", "elementwise_add"], (1e-5, 1e-5) + + def add_ignore_pass_case(self): + # Here we put some skip rules to avoid known bugs + def teller1(program_config, predictor_config): + if predictor_config.tensorrt_engine_enabled(): + # On 3080, the results of MatMul and Mul are different + # When the input Y is weight + return True + + # On TRT when the input Y is weight, Mul is converted to FC + if "matmul_y" not in program_config.weights \ + or "bias" not in program_config.weights: + return True + + y_shape = list(program_config.weights["matmul_y"].shape) + bias_shape = program_config.weights["bias"].shape + axis = program_config.ops[2].attrs["axis"] + # bias should be [mul_y_shape[-1]] + if axis == 0 or bias_shape[0] != y_shape[1] or len( + bias_shape) != 1: + return True + return False + + self.add_ignore_check_case( + teller1, + IgnoreReasons.PASS_ACCURACY_ERROR, + "The pass error on TRT while shape of bias is not [out_size].", ) + + def sample_program_config(self, draw): + # 1. Generate shape and attr of reshape2 + reshape = draw( + st.lists( + st.integers( + min_value=1, max_value=10), min_size=2, max_size=2)) + x_shape = reshape + [1, 1] + + # 2. Generate attr:transpose_X/transpose_Y/alpha of matmul + alpha = 1.0 + transpose_X = False + transpose_Y = False + + # 3. Generate legal shape of input:Y of matmul + y_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + y_shape[0] = x_shape[1] + + # 4. 
Generate legal attr:axis of elementwise_add + axis = draw(st.integers(min_value=-1, max_value=1)) + if axis == 0: + bias_shape = [x_shape[0]] + elif axis == 1: + bias_shape = [y_shape[1]] + else: + bias_shape = [x_shape[0], y_shape[1]] + if draw(st.booleans()): + bias_shape[1] = 1 + + reshape2_op = OpConfig( + "reshape2", + inputs={"X": ["reshape2_x"], }, + shape=reshape, + outputs={"Out": ["reshape2_out"], + "XShape": ["xshape"]}, ) + matmul_op = OpConfig( + "matmul", + inputs={"X": ["reshape2_out"], + "Y": ["matmul_y"]}, + outputs={"Out": ["matmul_out"]}, + alpha=alpha, + transpose_X=transpose_X, + transpose_Y=transpose_Y, + fused_reshape_X=[], + fused_reshape_Y=[], + fused_transpose_X=[], + fused_transpose_Y=[], + fused_reshape_Out=[], + fused_transpose_Out=[], ) + + add_op = OpConfig( + "elementwise_add", + inputs={"X": ["matmul_out"], + "Y": ["bias"]}, + outputs={"Out": ["add_out"]}, + axis=axis, ) + + ops = [reshape2_op, matmul_op, add_op] + + if draw(st.integers(min_value=1, max_value=10)) <= 8: + program_config = ProgramConfig( + ops=ops, + weights={ + "matmul_y": TensorConfig(shape=y_shape), + "bias": TensorConfig(shape=bias_shape), + }, + inputs={"reshape2_x": TensorConfig(shape=x_shape), }, + outputs=ops[-1].outputs["Out"], ) + else: + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "reshape2_x": TensorConfig(shape=x_shape), + "matmul_y": TensorConfig(shape=y_shape), + "bias": TensorConfig(shape=bias_shape), + }, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=50, + max_duration=1000, + passes=["reshape2_matmul_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py index 33f215dafda21..c8e939d3926eb 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seq_concat_fc_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,21 +12,129 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function - -import unittest +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig import numpy as np -from inference_pass_test import InferencePassTest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.core import AnalysisConfig -from paddle.fluid.core import PassVersionChecker +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st +from functools import reduce + + +class TestSeqConcatFcFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + ref_level = draw(st.sampled_from([0])) + axis1 = draw(st.sampled_from([1])) + x_col = draw(st.sampled_from([1])) + y_col = draw(st.sampled_from([1])) + axis2 = draw(st.sampled_from([1])) + use_cudnn = False + use_mkldnn = False + act_type = draw(st.sampled_from(["tanh", "sigmoid", "relu"])) + batch_size = draw(st.integers(min_value=1, max_value=1)) + dim = draw(st.integers(min_value=1, max_value=1000)) + + def generate_input(shape): + return np.random.random(shape).astype(np.float32) + + def generate_weight(shape): + return np.random.random(shape).astype(np.float32) + + sequence_expand_op1 = OpConfig( + type="sequence_expand", + inputs={"X": ["input_data1"], + "Y": ["input_data2"]}, + outputs={"Out": ["seq_exp1_out"]}, + attrs={"ref_level": ref_level}) + + sequence_expand_op2 = OpConfig( + type="sequence_expand", + inputs={"X": ["input_data1"], + "Y": ["input_data3"]}, + outputs={"Out": ["seq_exp2_out"]}, + attrs={"ref_level": ref_level}) + + concat_op = OpConfig( + type="concat", + inputs={"X": ["input_data1", "seq_exp1_out", "seq_exp2_out"]}, + outputs={"Out": ["concat_output"]}, + attrs={'axis': axis1}) + + mul_op = OpConfig( + type="mul", + inputs={"X": ["concat_output"], + "Y": ["mul_weight"]}, + outputs={"Out": ["mul_out"]}, + attrs={"x_num_col_dims": x_col, + "y_num_col_dims": y_col}) + + elt_op = OpConfig( + type="elementwise_add", + inputs={"X": ["mul_out"], + "Y": ["elt_weight"]}, + outputs={"Out": ["elt_out"]}, + attrs={"axis": axis2}) + + act_op = OpConfig( + type=act_type, + inputs={"X": ["elt_out"]}, + outputs={"Out": ["act_out"]}, + attrs={"use_cudnn": use_cudnn, + "use_mkldnn": use_mkldnn}) + + model_net = [ + sequence_expand_op1, sequence_expand_op2, concat_op, mul_op, elt_op, + act_op + ] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "mul_weight": + TensorConfig(data_gen=partial(generate_weight, [384, dim])), + "elt_weight": + TensorConfig(data_gen=partial(generate_weight, [dim])) + }, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input, [batch_size, 128]), + lod=[[0, 1]]), + "input_data2": TensorConfig( + data_gen=partial(generate_input, [batch_size, 128]), + lod=[[0, 1]]), + "input_data3": TensorConfig( + data_gen=partial(generate_input, [batch_size, 128]), + lod=[[0, 1]]) + }, + outputs=["act_out"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config() + yield config, ["fusion_seqexpand_concat_fc"], (1e-5, 1e-5) + + def add_ignore_pass_case(self): + def teller1(program_config, predictor_config): + if program_config.ops[-1].type == "relu": + return True + return False + 
self.add_ignore_check_case( + teller1, IgnoreReasons.PASS_ACCURACY_ERROR, + "The pass output has diff in a specific case. We need to fix it as soon as possible." + ) -class SeqConcatFCFusePassTest(InferencePassTest): - def test_compatible(self): - self.assertTrue( - PassVersionChecker.IsCompatible('seq_concat_fc_fuse_pass')) + def test(self): + self.run_and_statis(quant=False, passes=["seq_concat_fc_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py index eadda5ba06a79..769720fb2588c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,128 +12,102 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -import unittest +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig import numpy as np -from inference_pass_test import InferencePassTest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.core import AnalysisConfig -from paddle.fluid.core import PassVersionChecker - - -class SeqconvEltaddReluFusePassTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data(name="data", shape=[100, 100], dtype="float32") - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001) - conv_out = fluid.layers.sequence_conv( - input=data, - num_filters=16, - filter_size=4, - padding_start=0, - act="relu", - bias_attr=param_attr) - - np_data = np.random.random((80, 100)).astype('float32') - x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]], - fluid.CPUPlace()) - self.feeds = {"data": x_lod_tensor} - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - def test_check_output(self): - self.check_output() - self.assertTrue( - PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass')) - - -class SeqconvEltaddReluFusePassTestPaddingStartPositive(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data(name="data", shape=[-1, 4], dtype="float32") - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001) - conv_out = fluid.layers.sequence_conv( - input=data, - num_filters=16, - filter_size=3, - padding_start=2, - act="relu", - bias_attr=param_attr) - - np_data = np.array([[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], - [4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6], - [7, 7, 7, 7]]).astype('float32') - x_lod_tensor = fluid.create_lod_tensor(np_data, [[5, 2]], - fluid.CPUPlace()) - self.feeds = {"data": x_lod_tensor} - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - def test_check_output(self): - self.check_output() - self.assertTrue( - PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass')) - - -class 
SeqconvEltaddReluFusePassTestPaddingStartNegative(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data(name="data", shape=[100, 100], dtype="float32") - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001) - conv_out = fluid.layers.sequence_conv( - input=data, - num_filters=16, - filter_size=4, - padding_start=-1, - act="relu", - bias_attr=param_attr) - - np_data = np.random.random((80, 100)).astype('float32') - x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]], - fluid.CPUPlace()) - self.feeds = {"data": x_lod_tensor} - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - def test_check_output(self): - self.check_output() - self.assertTrue( - PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass')) - - -class SeqconvEltaddReluFusePassTestPaddingStartNone(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data(name="data", shape=[100, 100], dtype="float32") - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Xavier(uniform=False), - learning_rate=0.001) - conv_out = fluid.layers.sequence_conv( - input=data, - num_filters=16, - filter_size=4, - act="relu", - bias_attr=param_attr) - - np_data = np.random.random((80, 100)).astype('float32') - x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]], - fluid.CPUPlace()) - self.feeds = {"data": x_lod_tensor} - self.fetch_list = [conv_out] - self.enable_mkldnn = True - - def test_check_output(self): - self.check_output() - self.assertTrue( - PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass')) +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st +from functools import reduce + + +class TestSeqconvEltaddReluFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + contextLength = draw(st.sampled_from([1, 2, 3, 4])) + contextStart = draw(st.sampled_from([1, 2, 3])) + contextStride = draw(st.sampled_from([1])) + paddingTrainable = False + axis = draw(st.sampled_from([1])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + shape = [batch_size, 128, 6, 120] + return np.random.random(shape).astype(np.float32) + + def generate_weight(shape): + return np.random.random(shape).astype(np.float32) + + im2sequence_op = OpConfig( + type="im2sequence", + inputs={"X": ["input_data"]}, + outputs={"Out": ["seq_out"]}, + attrs={ + "kernels": [6, 1], + "out_stride": [1, 1], + "paddings": [0, 0, 0, 0], + "strides": [1, 1] + }) + + sequence_conv_op = OpConfig( + type="sequence_conv", + inputs={"X": ["seq_out"], + "Filter": ["conv_weight"]}, + outputs={"Out": ["conv_out"]}, + attrs={ + "contextLength": contextLength, + "contextStart": contextStart, + "contextStride": contextStride, + "paddingTrainable": paddingTrainable + }) + + elementwise_add_op = OpConfig( + type="elementwise_add", + inputs={"X": ["conv_out"], + "Y": ["elt_weight"]}, + outputs={"Out": ["elt_output"]}, + attrs={'axis': axis}) + + relu_op = OpConfig( + type="relu", + inputs={"X": ["elt_output"]}, + outputs={"Out": ["relu_output"]}, + attrs={}) + + model_net = [ + im2sequence_op, 
sequence_conv_op, elementwise_add_op, relu_op + ] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "conv_weight": TensorConfig(data_gen=partial( + generate_weight, [768 * contextLength, 16])), + "elt_weight": + TensorConfig(data_gen=partial(generate_weight, [16])) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["relu_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config() + yield config, ["im2sequence", "fusion_seqconv_eltadd_relu"], (1e-5, + 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["seqconv_eltadd_relu_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py new file mode 100644 index 0000000000000..605dc4edbe8c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squeeze2_matmul_fuse_pass.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestSqueeze2MatmulFusePass(PassAutoScanTest): + """ + x_var + | + squeeze2 + \ + squeeze2_out_var y_var + \ / + matmul bias_var + \ / + elementwise_add + """ + + def sample_predictor_configs(self, program_config): + # cpu + config = self.create_inference_config(use_gpu=False) + yield config, ["mul", "elementwise_add"], (1e-5, 1e-5) + + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["mul", "elementwise_add"], (1e-5, 1e-5) + + # TRT + # config = self.create_trt_inference_config() + # config.enable_tensorrt_engine( + # max_batch_size=10, + # workspace_size=10240, + # min_subgraph_size=0, + # precision_mode=paddle_infer.PrecisionType.Float32, + # use_static=False, + # use_calib_mode=False) + # yield config, ['mul', 'elementwise_add'], (1e-5, 1e-5) + + def add_ignore_pass_case(self): + # Here we put some skip rules to avoid known bugs + def teller1(program_config, predictor_config): + if predictor_config.tensorrt_engine_enabled(): + # On 3080, the results of MatMul and Mul are different + # When the input Y is weight + return True + + # On TRT when the input Y is weight, Mul is converted to FC + predictor_config.exp_disable_tensorrt_ops(["elementwise_add"]) + if "matmul_y" not in program_config.weights \ + or "bias" not in program_config.weights: + return True + + y_shape = list(program_config.weights["matmul_y"].shape) + bias_shape = 
program_config.weights["bias"].shape + axis = program_config.ops[2].attrs["axis"] + # bias should be [mul_y_shape[-1]] + if axis == 0 or bias_shape[0] != y_shape[1] or len( + bias_shape) != 1: + return True + return False + + self.add_ignore_check_case( + teller1, + IgnoreReasons.PASS_ACCURACY_ERROR, + "The pass error on TRT while shape of bias is not [out_size].", ) + + def sample_program_config(self, draw): + # 1. Generate shape of input:X of squeeze2 + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + # axes of squeeze2 == [2, 3] + x_shape += [1, 1] + axes = [2, 3] + + # 2. Generate attr:transpose_X/transpose_Y/alpha of matmul + alpha = 1.0 + transpose_X = False + transpose_Y = False + + # 3. Generate legal shape of input:Y of matmul + y_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + y_shape[0] = x_shape[1] + + # 4. Generate legal attr:axis of elementwise_add + axis = draw(st.integers(min_value=-1, max_value=1)) + if axis == 0 or axis == -1: + bias_shape = [x_shape[0], y_shape[1]] + else: + bias_shape = [y_shape[1], ] + if draw(st.booleans()): + bias_shape[-1] = 1 + if len(bias_shape) == 2 and draw(st.booleans()): + bias_shape[0] = 1 + + axis = 0 + bias_shape = [2, ] + x_shape = [2, 1, 1, 1] + y_shape = [1, 2] + + squeeze2_op = OpConfig( + "squeeze2", + inputs={"X": ["squeeze2_x"], }, + axes=axes, + outputs={"Out": ["squeeze2_out"], + "XShape": ["xshape"]}, ) + matmul_op = OpConfig( + "matmul", + inputs={"X": ["squeeze2_out"], + "Y": ["matmul_y"]}, + outputs={"Out": ["matmul_out"]}, + alpha=alpha, + transpose_X=transpose_X, + transpose_Y=transpose_Y, + fused_reshape_X=[], + fused_reshape_Y=[], + fused_transpose_X=[], + fused_transpose_Y=[], + fused_reshape_Out=[], + fused_transpose_Out=[], ) + + add_op = OpConfig( + "elementwise_add", + inputs={"X": ["matmul_out"], + "Y": ["bias"]}, + outputs={"Out": ["add_out"]}, + axis=axis, ) + + ops = [squeeze2_op, matmul_op, add_op] + + if draw(st.integers(min_value=1, max_value=10)) <= 8: + program_config = ProgramConfig( + ops=ops, + weights={ + "matmul_y": TensorConfig(shape=y_shape), + "bias": TensorConfig(shape=bias_shape), + }, + inputs={"squeeze2_x": TensorConfig(shape=x_shape), }, + outputs=ops[-1].outputs["Out"], ) + else: + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "squeeze2_x": TensorConfig(shape=x_shape), + "matmul_y": TensorConfig(shape=y_shape), + "bias": TensorConfig(shape=bias_shape), + }, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=50, + max_duration=1000, + passes=["squeeze2_matmul_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index b54b923d3b086..1c5b640fe4b0b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -215,11 +215,11 @@ def generate_dynamic_shape(attrs): "input_data2": [1] } self.dynamic_shape.max_input_shape = { - "input_data1": [256], + "input_data1": [128], "input_data2": [128] } self.dynamic_shape.opt_input_shape = { - "input_data1": [16], + "input_data1": [32], "input_data2": [32] } elif self.dims == 2: @@ -232,7 +232,7 @@ def generate_dynamic_shape(attrs): "input_data2": [128, 256] 
} self.dynamic_shape.opt_input_shape = { - "input_data1": [2, 16], + "input_data1": [32, 64], "input_data2": [32, 64] } elif self.dims == 3: @@ -241,11 +241,11 @@ def generate_dynamic_shape(attrs): "input_data2": [1, 4, 4] } self.dynamic_shape.max_input_shape = { - "input_data1": [128, 256, 128], + "input_data1": [128, 128, 256], "input_data2": [128, 128, 256] } self.dynamic_shape.opt_input_shape = { - "input_data1": [2, 32, 16], + "input_data1": [2, 64, 64], "input_data2": [2, 64, 64] } elif self.dims == 4: @@ -254,11 +254,11 @@ def generate_dynamic_shape(attrs): "input_data2": [1, 4, 4, 4] } self.dynamic_shape.max_input_shape = { - "input_data1": [8, 32, 64, 64], + "input_data1": [8, 128, 64, 128], "input_data2": [8, 128, 64, 128] } self.dynamic_shape.opt_input_shape = { - "input_data1": [2, 32, 32, 16], + "input_data1": [2, 64, 32, 32], "input_data2": [2, 64, 32, 32] } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py index b09ae80555e08..ba648042dabf7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_mean.py @@ -120,7 +120,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-4, 1e-4) + attrs, False), (5e-4, 5e-4) # for dynamic_shape generate_dynamic_shape(attrs) @@ -129,7 +129,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-4, 1e-4) + attrs, True), (5e-4, 5e-4) pass diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py index cd9987b3c8e82..c421a6d117e45 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py @@ -82,6 +82,20 @@ def op_grad(self, dout, x): return dout * self.op_forward(x) * (1 - self.op_forward(x)) +class TestMKLDNNSqrtBF16Op(MKLDNNBF16ActivationOp, TestActivation): + def config(self): + self.op_type = "sqrt" + + def init_data(self): + self.x = np.random.uniform(1, 2, [2, 4, 3, 5]).astype(np.float32) + + def op_forward(self, x): + return np.sqrt(x) + + def op_grad(self, dout, x): + return dout / (2 * np.sqrt(x)) + + class TestMKLDNNGeluErfBF16Op(MKLDNNBF16ActivationOp, TestActivation): def config(self): self.op_type = "gelu" diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index 6ee266a93d56a..8af2101346fec 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -349,6 +349,16 @@ def set_alpha(self): self.alpha = 2.5 +class TestMKLDNNExpOp(TestActivation): + def setUp(self): + self.op_type = "exp" + x = np.random.random((5, 5, 4)).astype("float32") + + self.inputs = {'X': x} + self.attrs = {'use_mkldnn': True} + self.outputs = {'Out': np.exp(x)} + + # Check if primitives already exist in backward class TestMKLDNNAbsPrimitivesAlreadyExist(unittest.TestCase): def 
setUp(self): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py index 7ab738ea577fc..4c753da0512f8 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -19,7 +19,7 @@ import struct import paddle.fluid.core as core -from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16, OpTestTool from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2DOp @@ -31,7 +31,7 @@ def conv2d_residual_naive(out, residual): @unittest.skipIf(not core.supports_bfloat16(), "place does not support BF16 evaluation") -class TestConv2DBf16Op(TestConv2DOp): +class TestConv2DBF16Op(TestConv2DOp): def setUp(self): self.op_type = "conv2d" self.use_cudnn = False @@ -51,15 +51,19 @@ def setUp(self): self.init_data_type() self.init_force_fp32_output() - conv2d_param = { + self.conv2d_param = { 'stride': self.stride, 'pad': self.pad, 'dilation': self.dilations } + self.input = np.random.random(self.input_size).astype(np.float32) self.filter = np.random.random(self.filter_size).astype(np.float32) - conv_out, _, _, _, _ = conv2d_forward_naive(self.input, self.filter, - self.groups, conv2d_param) + + self.inputs_fp32 = {'Input': self.input, 'Filter': self.filter} + + conv_out, _, _, _, _ = conv2d_forward_naive( + self.input, self.filter, self.groups, self.conv2d_param) self.conv_output_float = conv_out if self.fuse_residual: @@ -71,12 +75,16 @@ def setUp(self): self.outputs = {'Output': self.conv_output} elif self.force_fp32_output: self.outputs = {'Output': self.conv_output_float.astype(np.float32)} + else: + self.outputs = { + 'Output': convert_float_to_uint16(self.conv_output_float) + } if self.input_type is not np.float32: self.input = convert_float_to_uint16(self.input) self.inputs = { - 'Input': self.input.view(self.input_type), + 'Input': self.input, 'Filter': OpTest.np_dtype_to_fluid_dtype( self.filter.astype(self.weight_type)) } @@ -111,14 +119,18 @@ def test_check_grad_no_input(self): def init_test_case(self): TestConv2DOp.init_test_case(self) - self.input_size = [1, 1, 5, 5] # NCHW + self.input_size = [1, 6, 12, 12] # NCHW f_c = self.input_size[1] // self.groups - self.input_residual_size = [1, 2, 3, 3] - self.filter_size = [2, f_c, 3, 3] + o_c = 15 + self.input_residual_size = [1, o_c, 10, 10] + self.filter_size = [o_c, f_c, 3, 3] + + def init_padding(self): + pass def init_data_type(self): self.weight_type = np.float32 - self.input_type = np.float32 + self.input_type = np.uint16 def init_force_fp32_output(self): self.force_fp32_output = False @@ -130,7 +142,121 @@ def init_fuse_residual(self): self.fuse_residual = True -class TestConv2D(TestConv2DBf16Op): +@OpTestTool.skip_if_not_cpu_bf16() +class TestConv2DWithGradBF16Op(TestConv2DBF16Op): + def init_fuse_relu(self): + self.fuse_activation = None + + def init_fuse_residual(self): + self.fuse_residual = None + + def test_check_grad(self): + dout = self.conv_output_float + x = self.inputs_fp32['Input'] + w = self.inputs_fp32['Filter'] + + dx, dweights = conv_backward(dout, x, w, self.conv2d_param) + + self.check_grad_with_place( + core.CPUPlace(), ["Input", "Filter"], + "Output", + user_defined_grads=[dx, dweights], + user_defined_grad_outputs=[convert_float_to_uint16(dout)]) + + def 
test_check_grad_no_filter(self): + dout = self.conv_output_float + x = self.inputs_fp32['Input'] + w = self.inputs_fp32['Filter'] + + dx, _ = conv_backward(dout, x, w, self.conv2d_param) + + self.check_grad_with_place( + core.CPUPlace(), ["Input"], + "Output", + set(['Filter']), + user_defined_grads=[dx], + user_defined_grad_outputs=[convert_float_to_uint16(dout)]) + + def test_check_grad_no_input(self): + dout = self.conv_output_float + x = self.inputs_fp32['Input'] + w = self.inputs_fp32['Filter'] + + _, dweights = conv_backward(dout, x, w, self.conv2d_param) + + self.check_grad_with_place( + core.CPUPlace(), ["Filter"], + "Output", + set(['Input']), + user_defined_grads=[dweights], + user_defined_grad_outputs=[convert_float_to_uint16(dout)]) + + +def conv_backward(dout, x, w, params): + padding = params['pad'][0] + stride = params['stride'] + + dx = np.zeros_like(x) + dweights = np.zeros_like(w) + + N, IC, H, W = x.shape + OC, _, KH, KW = w.shape + + H_out = int(1 + (H + 2 * padding - KH) / stride[0]) + W_out = int(1 + (W + 2 * padding - KW) / stride[1]) + + x_padded = np.pad(x, ((0, ), (0, ), (padding, ), (padding, )), 'constant') + + for n in range(N): + for oc in range(OC): + for i in range(KH): + for j in range(KW): + for k in range(H_out): + for l in range(W_out): + for ic in range(IC): + dweights[oc, ic, i, j] += x_padded[ + n, ic, i + k * stride[0], j + l * stride[ + 1]] * dout[n, oc, k, l] + + dx_padded = np.pad(dx, ((0, ), (0, ), (padding, ), (padding, )), 'constant') + + w_ = np.zeros_like(w) + for i in range(KH): + for j in range(KW): + w_[:, :, i, j] = w[:, :, KH - i - 1, KW - j - 1] + + for n in range(N): + for oc in range(OC): + for i in range(H_out): + for j in range(W_out): + for kh in range(KH): + for kw in range(KW): + for ic in range(IC): + dx_padded[n, ic, stride[0] * i + kh, stride[1] * + j + kw] += dout[n, oc, i, j] * w[ + oc, ic, kh, kw] + + if padding == 0: + dx = dx_padded + else: + dx = dx_padded[:, :, padding:-padding, padding:-padding] + + return dx.astype(np.float32), dweights.astype(np.float32) + + +class TestConv2DBF16WithPadding1(TestConv2DWithGradBF16Op): + def init_test_case(self): + TestConv2DWithGradBF16Op.init_test_case(self) + self.pad = [1, 1] + + +class TestConv2DBF16WithStride2(TestConv2DWithGradBF16Op): + def init_test_case(self): + TestConv2DWithGradBF16Op.init_test_case(self) + self.stride = [2, 3] + + +class TestConv2D(TestConv2DBF16Op): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -156,7 +282,7 @@ def init_group(self): self.groups = 3 -class TestWithStride(TestConv2DBf16Op): +class TestWithStride(TestConv2DBF16Op): def init_test_case(self): self.pad = [1, 1] self.stride = [2, 2] @@ -170,7 +296,7 @@ def init_data_type(self): self.input_type = np.uint16 -class TestWithDilations(TestConv2DBf16Op): +class TestWithDilations(TestConv2DBF16Op): def init_test_case(self): self.pad = [1, 1] self.stride = [1, 1] @@ -185,7 +311,7 @@ def init_data_type(self): self.input_type = np.uint16 -class TestWith1x1ForceFP32Output(TestConv2DBf16Op): +class TestWith1x1ForceFP32Output(TestConv2DBF16Op): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -201,7 +327,7 @@ def init_fuse_residual(self): self.fuse_residual = False -class TestWithInput1x1Filter1x1(TestConv2DBf16Op): +class TestWithInput1x1Filter1x1(TestConv2DBF16Op): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py 
b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 5dd1795818c2b..25701b797ec4a 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -262,6 +262,41 @@ def config(self): self.trans_y = False +class TestMatMulV2MatrixXMatrix4Dx3DTransposeXOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (5, 4, 15, 10) + self.y_shape = (1, 15, 20) + self.trans_x = True + self.trans_y = False + + +class TestMatMulV2MatrixXMatrix3Dx4DTransposeYOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (2, 10, 15) + self.y_shape = (4, 2, 20, 15) + self.trans_x = False + self.trans_y = True + + +class TestMatMulV2MatrixXMatrix5Dx3DTransposeXTransposeYOneDNNOp( + TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (4, 3, 2, 15, 10) + self.y_shape = (1, 20, 15) + self.trans_x = True + self.trans_y = True + + +class TestMatMulV2MatrixXMatrix3Dx4DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 1, 32, 16) + self.y_shape = (16, 16, 16) + self.trans_x = False + self.trans_y = False + + # BF16 TESTS def create_bf16_test_class(parent): @OpTestTool.skip_if_not_cpu_bf16() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_mkldnn_op.py index 9f39826cb3ed2..a802ef4c61285 100755 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_mkldnn_op.py @@ -16,9 +16,6 @@ import unittest import numpy as np -import paddle -import paddle.fluid.core as core -import paddle.fluid as fluid from paddle.fluid.tests.unittests.op_test import OpTest from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci @@ -66,6 +63,9 @@ class TestNearestInterpMKLDNNOp(OpTest): def init_test_case(self): pass + def init_data_type(self): + pass + def setUp(self): self.op_type = "nearest_interp" self.interp_method = 'nearest' @@ -73,6 +73,7 @@ def setUp(self): self.use_mkldnn = True self.input_shape = [1, 1, 2, 2] self.data_layout = 'NCHW' + self.dtype = np.float32 # priority: actual_shape > out_size > scale > out_h & out_w self.out_h = 1 self.out_w = 1 @@ -81,8 +82,15 @@ def setUp(self): self.actual_shape = None self.init_test_case() + self.init_data_type() + + if self.dtype == np.float32: + input_np = np.random.random(self.input_shape).astype(self.dtype) + else: + init_low, init_high = (-5, 5) if self.dtype == np.int8 else (0, 10) + input_np = np.random.randint(init_low, init_high, + self.input_shape).astype(self.dtype) - input_np = np.random.random(self.input_shape).astype("float32") if self.data_layout == "NCHW": in_h = self.input_shape[2] in_w = self.input_shape[3] @@ -162,6 +170,35 @@ def init_test_case(self): self.scale = 0. 
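# NOTE (editorial sketch, not part of this patch): the create_test_class factory
# added below binds every generated dtype variant to globals()[parent.__name__],
# so the Fp32/Int8/Uint8 classes overwrite one another and only the last
# registration is collected by unittest. A minimal variant factory that keeps
# each class discoverable could look like the following; the function and class
# names here are hypothetical and shown only for illustration:
import numpy as np

def create_dtype_test_classes(parent, scope):
    """Register one subclass of `parent` per dtype under a distinct name."""
    for suffix, dtype in (("Fp32", np.float32), ("Int8", np.int8),
                          ("Uint8", np.uint8)):
        cls = type(parent.__name__ + suffix, (parent, ),
                   {"init_data_type":
                    lambda self, dt=dtype: setattr(self, "dtype", dt)})
        # Distinct keys keep all three variants visible to the test loader.
        scope[cls.__name__] = cls

# usage (hypothetical): create_dtype_test_classes(TestNearestInterpMKLDNNOp, globals())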
+def create_test_class(parent): + class TestFp32Case(parent): + def init_data_type(self): + self.dtype = np.float32 + + class TestInt8Case(parent): + def init_data_type(self): + self.dtype = np.int8 + + class TestUint8Case(parent): + def init_data_type(self): + self.dtype = np.uint8 + + TestFp32Case.__name__ = parent.__name__ + TestInt8Case.__name__ = parent.__name__ + TestUint8Case.__name__ = parent.__name__ + globals()[parent.__name__] = TestFp32Case + globals()[parent.__name__] = TestInt8Case + globals()[parent.__name__] = TestUint8Case + + +create_test_class(TestNearestInterpMKLDNNOp) +create_test_class(TestNearestInterpOpMKLDNNNHWC) +create_test_class(TestNearestNeighborInterpMKLDNNCase2) +create_test_class(TestNearestNeighborInterpCase3) +create_test_class(TestNearestNeighborInterpCase4) +create_test_class(TestNearestInterpOpMKLDNNNHWC) +create_test_class(TestNearestNeighborInterpSame) + if __name__ == "__main__": from paddle import enable_static enable_static() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py index b608ca3af2f36..24ebf40216f4b 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py @@ -16,9 +16,6 @@ import unittest import numpy as np -import paddle -import paddle.fluid.core as core -import paddle.fluid as fluid from paddle.fluid.tests.unittests.op_test import OpTest from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci @@ -66,6 +63,9 @@ class TestNearestInterpV2MKLDNNOp(OpTest): def init_test_case(self): pass + def init_data_type(self): + pass + def setUp(self): self.op_type = "nearest_interp_v2" self.interp_method = 'nearest' @@ -73,6 +73,7 @@ def setUp(self): self.use_mkldnn = True self.input_shape = [1, 1, 2, 2] self.data_layout = 'NCHW' + self.dtype = np.float32 # priority: actual_shape > out_size > scale > out_h & out_w self.out_h = 1 self.out_w = 1 @@ -81,8 +82,15 @@ def setUp(self): self.actual_shape = None self.init_test_case() + self.init_data_type() + + if self.dtype == np.float32: + input_np = np.random.random(self.input_shape).astype(self.dtype) + else: + init_low, init_high = (-5, 5) if self.dtype == np.int8 else (0, 10) + input_np = np.random.randint(init_low, init_high, + self.input_shape).astype(self.dtype) - input_np = np.random.random(self.input_shape).astype("float32") if self.data_layout == "NCHW": in_h = self.input_shape[2] in_w = self.input_shape[3] @@ -178,6 +186,34 @@ def init_test_case(self): self.out_size = np.array([65, 129]).astype("int32") +def create_test_class(parent): + class TestFp32Case(parent): + def init_data_type(self): + self.dtype = np.float32 + + class TestInt8Case(parent): + def init_data_type(self): + self.dtype = np.int8 + + class TestUint8Case(parent): + def init_data_type(self): + self.dtype = np.uint8 + + TestFp32Case.__name__ = parent.__name__ + TestInt8Case.__name__ = parent.__name__ + TestUint8Case.__name__ = parent.__name__ + globals()[parent.__name__] = TestFp32Case + globals()[parent.__name__] = TestInt8Case + globals()[parent.__name__] = TestUint8Case + + +create_test_class(TestNearestInterpV2MKLDNNOp) +create_test_class(TestNearestInterpOpV2MKLDNNNHWC) +create_test_class(TestNearestNeighborInterpV2MKLDNNCase2) +create_test_class(TestNearestNeighborInterpV2MKLDNNCase3) +create_test_class(TestNearestNeighborInterpV2MKLDNNCase4) 
+create_test_class(TestNearestNeighborInterpV2MKLDNNSame) + if __name__ == "__main__": from paddle import enable_static enable_static() diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py new file mode 100644 index 0000000000000..7d6c3b9bdb444 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + +paddle.enable_static() + + +class TestPadOp(OpTest): + def setUp(self): + self.op_type = "pad" + self.set_npu() + self.init_dtype() + self.initTestCase() + + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype), } + self.attrs = {} + self.attrs['paddings'] = np.array(self.paddings).flatten() + self.attrs['pad_value'] = self.pad_value + self.outputs = { + 'Out': np.pad(self.inputs['X'], + self.paddings, + mode='constant', + constant_values=self.pad_value) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.float16: + return + + self.check_grad_with_place(self.place, ['X'], 'Out') + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def initTestCase(self): + self.shape = (16, 16) + self.paddings = [(1, 1), (2, 3)] + self.pad_value = 0.0 + + +class TestCase1(TestPadOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.paddings = [(0, 1), (2, 3), (2, 1), (1, 1)] + self.pad_value = 0.0 + + +class TestCase2(TestPadOp): + def initTestCase(self): + self.shape = (5, 5, 5) + self.paddings = [(0, 0), (0, 0), (1, 2)] + self.pad_value = 0.0 + + +class TestCase3(TestPadOp): + def initTestCase(self): + self.shape = (100) + self.paddings = [(0, 1)] + self.pad_value = 0.0 + + +#----------------Pad Fp16---------------- + + +def create_test_fp16(parent): + class TestPadFp16(parent): + def init_dtype(self): + self.dtype = np.float16 + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestPadFp16.__name__ = cls_name + globals()[cls_name] = TestPadFp16 + + +create_test_fp16(TestPadOp) +create_test_fp16(TestCase1) +create_test_fp16(TestCase2) +create_test_fp16(TestCase3) + + +class TestPadOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + input_data = np.random.random((2, 2)).astype("float32") + + def test_Variable(): + fluid.layers.pad(x=input_data, paddings=[1, 1, 1, 1]) + + self.assertRaises(TypeError, test_Variable) + + data = fluid.data(name='data', shape=[4], dtype='float16') + fluid.layers.pad(x=data, paddings=[0, 1]) + + +if __name__ == '__main__': + 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index a3e1650c131cd..ec59c27558332 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -139,8 +139,8 @@ def product(dim): elif tensor_to_check_dtype == core.VarDesc.VarType.COMPLEX128: tensor_tp_check_dtype = np.complex128 else: - raise ValueError("Not supported data type " + str( - tensor_to_check_dtype)) + raise ValueError("Not supported data type " + str(tensor_to_check_dtype) + + ", tensor name : " + str(input_to_check)) def get_output(): sum = [] diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index c1d7802633eca..d3d8fdd703148 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2837,6 +2837,86 @@ def test_errors(self): F.swish(x_fp16) +def ref_mish(x, threshold=20.): + softplus = np.select([x <= threshold, x > threshold], + [np.log(1 + np.exp(x)), x]) + return x * np.tanh(softplus) + + +class TestMish(TestActivation): + def setUp(self): + self.op_type = "mish" + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_mish(x) + self.inputs = {'X': x} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out') + + +class TestMishAPI(unittest.TestCase): + # test paddle.nn.Mish, paddle.nn.functional.mish + def setUp(self): + np.random.seed(1024) + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.mish(x) + mish = paddle.nn.Mish() + out2 = mish(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_mish(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.mish(x) + mish = paddle.nn.Mish() + out2 = mish(x) + out_ref = ref_mish(self.x_np) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + paddle.enable_static() + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.mish(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_mish(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.mish, 1) + # The input dtype must be float16, float32, float64. 
+ x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.mish, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') + F.mish(x_fp16) + + #------------------ Test Error Activation---------------------- def create_test_error_class(op_type): class TestOpErrors(unittest.TestCase): @@ -2972,6 +3052,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestHardSigmoid) create_test_act_fp16_class(TestSwish, grad_atol=0.85) create_test_act_fp16_class(TestHardSwish) +create_test_act_fp16_class(TestMish, grad_atol=0.9) def create_test_act_bf16_class(parent, diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 70109164960a3..a06f0d390e517 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -1011,5 +1011,186 @@ def test_adam_op(self): adam.clear_gradients() +class TestMultiTensorAdam(unittest.TestCase): + def _adam_optimize_dygraph(self, + place, + use_param_attr=False, + use_param_group=False, + use_amp=False, + use_multi_tensor=False): + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + + input = paddle.randn((5, 5)) + + weight_attr = paddle.ParamAttr( + learning_rate=0.5, + regularizer=paddle.regularizer.L2Decay(1.0), + trainable=True) + if use_param_attr: + model = paddle.nn.Linear(5, 5, weight_attr) + else: + model = paddle.nn.Linear(5, 5) + + if not use_param_group: + optimizer = paddle.optimizer.Adam( + parameters=model.parameters(), + use_multi_tensor=use_multi_tensor, + multi_precision=use_amp) + else: + optimizer = paddle.optimizer.Adam( + parameters=[{ + 'params': model.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99 + }], + use_multi_tensor=use_multi_tensor, + multi_precision=use_amp) + + for idx in range(2): + if place == 'gpu' and use_amp == True: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if place == 'gpu' and use_amp == True: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + return output, model.parameters() + + def _adam_optimize_static(self, + place, + use_amp=False, + use_multi_tensor=False): + paddle.enable_static() + paddle.seed(10) + np.random.seed(10) + if place == 'cpu': + use_amp = False + exe = paddle.static.Executor(place=place) + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.optimizer.Adam( + multi_precision=use_amp, use_multi_tensor=use_multi_tensor) + if use_amp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False) + with paddle.static.program_guard(train_program, startup_program): + if use_amp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16') + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32') + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.fluid.layers.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + if use_amp: + optimizer.amp_init(place=place, 
scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + loss_data, = exe.run(train_program, + feed={"X": x}, + fetch_list=[loss.name]) + out.append(loss_data) + return out + + def _get_places(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def _check_with_place_amp(self, place, use_amp): + # test dygraph mode + output_dygraph1, params_dygraph1 = self._adam_optimize_dygraph( + place=place, use_amp=use_amp, use_multi_tensor=True) + output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph( + place=place, use_amp=use_amp, use_multi_tensor=False) + self.assertEqual( + np.allclose( + output_dygraph1, output_dygraph2, rtol=1e-05), True) + for idx in range(len(params_dygraph1)): + self.assertEqual( + np.allclose( + params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05), + True) + # test static mode + output_static1 = self._adam_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=True) + output_static2 = self._adam_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=False) + for idx in range(len(output_static1)): + self.assertEqual( + np.allclose( + output_static1[idx], output_static2[idx], rtol=1e-05), + True) + + def _check_with_param_arrt(self, place, use_amp): + output1, params1 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_attr=True, + use_multi_tensor=True) + output2, params2 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_attr=True, + use_multi_tensor=False) + + self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True) + for idx in range(len(params1)): + self.assertEqual( + np.allclose( + params1[idx], params2[idx], rtol=1e-05), True) + + def _check_with_param_group(self, place, use_amp): + output1, params1 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_group=True, + use_multi_tensor=True) + output2, params2 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_group=True, + use_multi_tensor=False) + + self.assertEqual(np.allclose(output1, output2, rtol=1e-05), True) + for idx in range(len(params1)): + self.assertEqual( + np.allclose( + params1[idx], params2[idx], rtol=1e-05), True) + + def test_main(self): + for place in self._get_places(): + use_amp_list = [True, False] + for use_amp in use_amp_list: + self._check_with_place_amp(place, use_amp) + self._check_with_param_arrt(place, use_amp) + self._check_with_param_group(place, use_amp) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index 7513d8810e61a..3dbd9311a71ed 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -169,6 +169,31 @@ def test_assign_BasicTypes(self): self.assertTrue(np.allclose(result3.numpy(), np.array([1]))) paddle.enable_static() + def test_clone(self): + paddle.disable_static() + x = paddle.ones([2]) + x.stop_gradient = False + clone_x = paddle.clone(x) + + y = clone_x**3 + y.backward() + + self.assertTrue(np.array_equal(x, [1, 1]), True) + self.assertTrue(np.array_equal(clone_x.grad.numpy(), [3, 3]), True) + self.assertTrue(np.array_equal(x.grad.numpy(), [3, 3]), True) + paddle.enable_static() + + with program_guard(Program(), Program()): + x_np = np.random.randn(2, 
3).astype('float32') + x = paddle.static.data("X", shape=[2, 3]) + clone_x = paddle.clone(x) + exe = paddle.static.Executor() + y_np = exe.run(paddle.static.default_main_program(), + feed={'X': x_np}, + fetch_list=[clone_x])[0] + + self.assertTrue(np.array_equal(y_np, x_np), True) + class TestAssignOpErrorApi(unittest.TestCase): def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py index d58c79dd72cb8..83254de61298b 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -26,7 +26,7 @@ from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.reshard import reshard from paddle.distributed.auto_parallel.cost_model import estimate_cost import paddle.fluid.core as core @@ -148,22 +148,30 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): loss, train_program, startup_program = mlp_forward(train_program, startup_program) - dist_strategy = fleet.DistributedStrategy() + fleet._user_defined_strategy = fleet.DistributedStrategy() + fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + parallelizer = AutoParallelizer(fleet) + parallelizer._dist_context = dist_context - # auto completion + # serial forward & backward completion complete_train_program = auto.complete_annotation(train_program, dist_context) - partitioner = Partitioner(dist_strategy, dist_context, rank_id) + + params_grads = parallelizer._generate_backward( + complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None) + # logical partition - auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( - complete_train_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, startup_program, auto_parallel_main_prog, - auto_parallel_startup_prog) - optimizer = paddle.fluid.optimizer.AdamOptimizer() - opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - auto_parallel_main_prog, - auto_parallel_startup_prog) + partitioner = Partitioner(dist_context, rank_id) + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads = partitioner.partition( + complete_train_program, startup_program, params_grads) + + partitioned_optimize_ops = parallelizer._apply_optimize( + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads) return auto_parallel_main_prog, auto_parallel_startup_prog diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 4fd64dc252bcd..3a28595c833e0 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -36,6 +36,7 @@ from paddle.distributed import fleet import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.dist_context import DistributedContext from 
paddle.distributed.auto_parallel.partitioner import Partitioner from paddle.distributed.auto_parallel.reshard import reshard @@ -469,21 +470,30 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): loss, train_program, startup_program = mlp_forward(train_program, startup_program) - dist_strategy = fleet.DistributedStrategy() + fleet._user_defined_strategy = fleet.DistributedStrategy() + fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + parallelizer = AutoParallelizer(fleet) + parallelizer._dist_context = dist_context # auto completion complete_train_program = auto.complete_annotation(train_program, dist_context) - partitioner = Partitioner(dist_strategy, dist_context, rank_id) - # logical partition - dist_train_program, dist_startup_prog = partitioner.transpile_forward( - complete_train_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, startup_program, dist_train_program, - dist_startup_prog) - optimizer = paddle.fluid.optimizer.AdamOptimizer() - opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - dist_train_program, dist_startup_prog) + + params_grads = parallelizer._generate_backward( + complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None) + + partitioner = Partitioner(dist_context, rank_id) + dist_train_program, dist_startup_prog, dist_params_grads = partitioner.partition( + complete_train_program, startup_program, params_grads) + + partitioned_optimize_ops = parallelizer._apply_optimize( + dist_train_program, dist_startup_prog, dist_params_grads) + reshard(dist_train_program, dist_startup_prog, rank_id, dist_context) return dist_train_program, dist_startup_prog diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py old mode 100755 new mode 100644 index 3a23f9b2611dc..21cf8a904b690 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py @@ -54,9 +54,9 @@ def get_programs(annotated_func): rank_id = 3 dist_strategy = fleet.DistributedStrategy() - partitioner = Partitioner(dist_strategy, dist_context, rank_id) - test_auto_parallel_dist_main_prog, test_auto_parallel_dist_startup_prog = partitioner.transpile_forward( - complete_train_program, start_program) + partitioner = Partitioner(dist_context, rank_id) + test_auto_parallel_dist_main_prog, test_auto_parallel_dist_startup_prog, _ = partitioner.partition( + complete_train_program, start_program, []) return complete_train_program, start_program, test_auto_parallel_dist_main_prog, test_auto_parallel_dist_startup_prog, dist_context diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py old mode 100755 new mode 100644 index 7fcb18db12817..dc2ad1d900f52 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -35,6 +35,7 @@ from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.utils 
import _get_comm_group from paddle.distributed.auto_parallel.process_group import new_process_group @@ -790,9 +791,9 @@ def forward(self, prediction_scores, masked_lm_labels, loss_mask): return loss -def gpt_pretrain_forward(train_program, start_program): +def gpt_pretrain_forward(train_program, startup_program): with static.program_guard(train_program, - start_program), utils.unique_name.guard(): + startup_program), utils.unique_name.guard(): batch_size = 16 sequence_len = 512 input_ids = static.data( @@ -848,7 +849,19 @@ def gpt_pretrain_forward(train_program, start_program): loss = criterion(preds, labels, loss_mask) - return train_program, start_program, loss + return train_program, startup_program, loss + + +class FakeStrategy(object): + def __init__(self): + self.amp = False + self.recompute = False + + +class FakeFleet(object): + def __init__(self): + self.user_defined_optimizer = None + self._user_defined_strategy = FakeStrategy() class TestGPTPartitioner(unittest.TestCase): @@ -861,38 +874,37 @@ def test_gpt_dp_mp(self): mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) train_program = static.Program() - start_program = static.Program() - dist_context = DistributedContext() + startup_program = static.Program() + parallelizer = AutoParallelizer(FakeFleet()) + dist_context = parallelizer._dist_context + dist_context.process_mesh = _global_process_mesh - train_program, start_program, loss = gpt_pretrain_forward(train_program, - start_program) + train_program, startup_program, loss = gpt_pretrain_forward( + train_program, startup_program) complete_train_program = auto.complete_annotation(train_program, dist_context) + + # serial backward pass + params_grads = parallelizer._generate_backward( + complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None) + rank_id = 3 - dist_strategy = fleet.DistributedStrategy() - partitioner = Partitioner(dist_strategy, dist_context, rank_id) - auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( - complete_train_program, start_program) - dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, start_program, - auto_parallel_main_prog, auto_parallel_startup_prog) + partitioner = Partitioner(dist_context, rank_id) + auto_parallel_main_prog, auto_parallel_startup_prog, params_grads = partitioner.partition( + complete_train_program, startup_program, params_grads) with open("./test_auto_parallel_partitioner_serial_main_new.txt", "w") as fw: fw.write(str(train_program)) with open("./test_auto_parallel_partitioner_serial_startup_new.txt", "w") as fw: - fw.write(str(start_program)) - - optimizer = paddle.fluid.optimizer.AdamOptimizer( - learning_rate=0.00001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - grad_clip=None) - opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - auto_parallel_main_prog, - auto_parallel_startup_prog) + fw.write(str(startup_program)) + from paddle.distributed.auto_parallel.dist_context import set_default_distributed_context set_default_distributed_context(dist_context) with open("./test_auto_parallel_partitioner_main_new.txt1", "w") as fw: @@ -927,7 +939,7 @@ def test_gpt_dp_mp(self): complete_train_program, weights, 0, 1)) all_params = sorted( - [param.name for param in start_program.all_parameters()]) + [param.name for param in startup_program.all_parameters()]) allreduce_grads = [ 'layer_norm_5.tmp_2', 'layer_norm_5.tmp_2', 'layer_norm_5.tmp_2', 'layer_norm_6.tmp_2', 'layer_norm_7.tmp_2', 'layer_norm_7.tmp_2', diff --git 
a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index 0439b9a287cf6..614b996d26521 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -24,6 +24,7 @@ import paddle.distributed.auto_parallel as auto from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet +from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.partitioner import Partitioner from paddle.distributed.auto_parallel.reshard import reshard from paddle.distributed.auto_parallel.process_group import _g_process_group_map @@ -145,22 +146,31 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): loss, train_program, startup_program = mlp_forward(train_program, startup_program) - # auto completion + fleet._user_defined_strategy = fleet.DistributedStrategy() + fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + parallelizer = AutoParallelizer(fleet) + parallelizer._dist_context = dist_context + + # serial forward & backward completion complete_train_program = auto.complete_annotation(train_program, dist_context) - dist_strategy = fleet.DistributedStrategy() - partitioner = Partitioner(dist_strategy, dist_context, rank_id) + params_grads = parallelizer._generate_backward( + complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None) + # logical partition - auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( - complete_train_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, startup_program, auto_parallel_main_prog, - auto_parallel_startup_prog) - optimizer = paddle.fluid.optimizer.AdamOptimizer() - opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - auto_parallel_main_prog, - auto_parallel_startup_prog) + partitioner = Partitioner(dist_context, rank_id) + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads = partitioner.partition( + complete_train_program, startup_program, params_grads) + + partitioned_optimize_ops = parallelizer._apply_optimize( + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads) + return auto_parallel_main_prog, auto_parallel_startup_prog diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index 4bd03a3e1bd92..cfbb7653fad8e 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -24,6 +24,7 @@ import paddle.distributed.auto_parallel as auto from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet +from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.partitioner import Partitioner from paddle.distributed.auto_parallel.reshard import reshard from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr @@ -109,22 +110,31 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): loss, train_program, startup_program = mlp_forward(train_program, startup_program) - # auto completion + fleet._user_defined_strategy = 
fleet.DistributedStrategy() + fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + parallelizer = AutoParallelizer(fleet) + parallelizer._dist_context = dist_context + + # serial forward & backward completion complete_train_program = auto.complete_annotation(train_program, dist_context) - dist_strategy = fleet.DistributedStrategy() - partitioner = Partitioner(dist_strategy, dist_context, rank_id) + params_grads = parallelizer._generate_backward( + complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None) + # logical partition - auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( - complete_train_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, startup_program, auto_parallel_main_prog, - auto_parallel_startup_prog) - optimizer = paddle.fluid.optimizer.AdamOptimizer() - opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - auto_parallel_main_prog, - auto_parallel_startup_prog) + partitioner = Partitioner(dist_context, rank_id) + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads = partitioner.partition( + complete_train_program, startup_program, params_grads) + + partitioned_optimize_ops = parallelizer._apply_optimize( + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads) + return auto_parallel_main_prog, auto_parallel_startup_prog diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index ae79712dc7936..272c1c212f08e 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -24,6 +24,7 @@ import paddle.distributed.auto_parallel as auto from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet +from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.partitioner import Partitioner from paddle.distributed.auto_parallel.reshard import reshard from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr @@ -125,22 +126,30 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): loss, train_program, startup_program = mlp_forward(train_program, startup_program) - # auto completion + fleet._user_defined_strategy = fleet.DistributedStrategy() + fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + parallelizer = AutoParallelizer(fleet) + parallelizer._dist_context = dist_context + + # serial forward & backward completion complete_train_program = auto.complete_annotation(train_program, dist_context) - dist_strategy = fleet.DistributedStrategy() - partitioner = Partitioner(dist_strategy, dist_context, rank_id) + params_grads = parallelizer._generate_backward( + complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None) + # logical partition - auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( - complete_train_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, startup_program, auto_parallel_main_prog, - auto_parallel_startup_prog) - optimizer = paddle.fluid.optimizer.AdamOptimizer() - opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - 
auto_parallel_main_prog, - auto_parallel_startup_prog) + partitioner = Partitioner(dist_context, rank_id) + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads = partitioner.partition( + complete_train_program, startup_program, params_grads) + + partitioned_optimize_ops = parallelizer._apply_optimize( + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads) return auto_parallel_main_prog, auto_parallel_startup_prog @@ -253,14 +262,15 @@ def test_allgather(self): rank_id = 0 dist_context = DistributedContext() dist_strategy = fleet.DistributedStrategy() - partitioner = Partitioner(dist_strategy, dist_context, rank_id) + partitioner = Partitioner(dist_context, rank_id) complete_train_program = auto.complete_annotation(train_program, dist_context) - auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( - complete_train_program, startup_program) - reshard(auto_parallel_main_prog, startup_program, rank_id, dist_context) + partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads = partitioner.partition( + complete_train_program, startup_program, []) + reshard(partitioned_main_prog, partitioned_startup_prog, rank_id, + dist_context) # the x should not be slice - self.assertTrue(check_allgather(auto_parallel_main_prog)) + self.assertTrue(check_allgather(partitioned_main_prog)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py old mode 100644 new mode 100755 index 92d11801902a0..ed64fa0630fa1 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py @@ -212,41 +212,6 @@ def test_enumerater_and_checker(self): self.assertTrue( check_nonpipeline_enumerater(train_program, process_mesh_topology)) - def test_get_dist_programs(self): - train_program = paddle.static.Program() - startup_program = paddle.static.Program() - loss, train_program, startup_program = mlp_forward(train_program, - startup_program) - process_mesh_topology = [4] - optimizer = paddle.optimizer.Adam( - learning_rate=0.00001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - grad_clip=None) - valid_dist_attr_dict, pipeline_process_meshes, global_process_mesh = PlanSpace.enum_valid_dist_attr_for_program( - train_program, process_mesh_topology, False) - from test_auto_parallel_cluster import cluster_json - cluster_json_file = "" - cluster_json_object = json.loads(cluster_json) - with open("./auto_parallel_cluster.json", "w") as cluster_json_file: - json.dump(cluster_json_object, cluster_json_file) - cluster = Cluster() - cluster.build_from_file("./auto_parallel_cluster.json") - os.remove("./auto_parallel_cluster.json") - - ops = train_program.global_block().ops - vars = train_program.global_block().vars - new_dist_context = DistributedContext() - set_default_dist_attr(train_program, new_dist_context, - global_process_mesh) - - serial_program_info = SerialProgramInfo(train_program, startup_program, - loss, optimizer, cluster) - result = get_all_distributed_main_program(serial_program_info, - new_dist_context) - self.assertEqual(len(result), 4) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py index 12a29de804266..471caeb77bf65 100644 --- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py @@ -32,18 +32,14 @@ class TestBernoulliOp(OpTest): def setUp(self): self.op_type = "bernoulli" self.inputs = {"X": np.random.uniform(size=(1000, 784))} - self.init_attrs() - self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} - - def init_attrs(self): self.attrs = {} - self.output_hist = output_hist + self.outputs = {"Out": np.zeros((1000, 784)).astype("float32")} def test_check_output(self): self.check_output_customized(self.verify_output) def verify_output(self, outs): - hist, prob = self.output_hist(np.array(outs[0])) + hist, prob = output_hist(np.array(outs[0])) self.assertTrue( np.allclose( hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py new file mode 100644 index 0000000000000..c31594b75e985 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cholesky_solve_op.py @@ -0,0 +1,262 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import scipy +import scipy.linalg + +import sys +sys.path.append("..") +import paddle +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard, core + +paddle.enable_static() + + +def cholesky_solution(X, B, upper=True): + if upper: + A = np.triu(X) + L = A.T + U = A + else: + A = np.tril(X) + L = A + U = A.T + return scipy.linalg.solve_triangular( + U, scipy.linalg.solve_triangular( + L, B, lower=True)) + + +def scipy_cholesky_solution(X, B, upper=True): + if upper: + umat = np.triu(X) + A = umat.T @umat + else: + umat = np.tril(X) + A = umat @umat.T + K = scipy.linalg.cho_factor(A) + return scipy.linalg.cho_solve(K, B) + + +def boardcast_shape(matA, matB): + shapeA = matA.shape + shapeB = matB.shape + Boardshape = [] + for idx in range(len(shapeA) - 2): + if shapeA[idx] == shapeB[idx]: + Boardshape.append(shapeA[idx]) + continue + elif shapeA[idx] == 1 or shapeB[idx] == 1: + Boardshape.append(max(shapeA[idx], shapeB[idx])) + else: + raise Exception( + 'shapeA and shapeB should be broadcastable, but got {} and {}'.
+ format(shapeA, shapeB)) + bsA = Boardshape + list(shapeA[-2:]) + bsB = Boardshape + list(shapeB[-2:]) + return np.broadcast_to(matA, bsA), np.broadcast_to(matB, bsB) + + +def scipy_cholesky_solution_batch(bumat, bB, upper=True): + bumat, bB = boardcast_shape(bumat, bB) + ushape = bumat.shape + bshape = bB.shape + bumat = bumat.reshape((-1, ushape[-2], ushape[-1])) + bB = bB.reshape((-1, bshape[-2], bshape[-1])) + batch = 1 + for d in ushape[:-2]: + batch *= d + bx = [] + for b in range(batch): + # x = scipy_cholesky_solution(bumat[b], bB[b], upper) #large matrix result error + x = cholesky_solution(bumat[b], bB[b], upper) + bx.append(x) + return np.array(bx).reshape(bshape) + + +# 2D + 2D , , upper=False +class TestCholeskySolveOp(OpTest): + """ + case 1 + """ + + def config(self): + self.y_shape = [15, 15] + self.x_shape = [15, 5] + self.upper = False + self.dtype = np.float64 + + def set_output(self): + umat = self.inputs['Y'] + self.output = scipy_cholesky_solution_batch( + umat, self.inputs['X'], upper=self.upper) + + def setUp(self): + self.op_type = "cholesky_solve" + self.config() + + if self.upper: + umat = np.triu(np.random.random(self.y_shape).astype(self.dtype)) + else: + umat = np.tril(np.random.random(self.y_shape).astype(self.dtype)) + + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype), + 'Y': umat + } + self.attrs = {'upper': self.upper} + self.set_output() + self.outputs = {'Out': self.output} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['Y'], 'Out', max_relative_error=0.01) + + +# 3D(broadcast) + 3D, upper=True +class TestCholeskySolveOp3(TestCholeskySolveOp): + """ + case 3 + """ + + def config(self): + self.y_shape = [1, 10, 10] + self.x_shape = [2, 10, 5] + self.upper = True + self.dtype = np.float64 + + +class TestCholeskySolveAPI(unittest.TestCase): + def setUp(self): + np.random.seed(2021) + self.place = [paddle.CPUPlace()] + # self.place = [paddle.CUDAPlace(0)] + self.dtype = "float64" + self.upper = True + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place): + paddle.enable_static() + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.data(name="x", shape=[10, 2], dtype=self.dtype) + y = fluid.data(name="y", shape=[10, 10], dtype=self.dtype) + z = paddle.linalg.cholesky_solve(x, y, upper=self.upper) + + x_np = np.random.random([10, 2]).astype(self.dtype) + y_np = np.random.random([10, 10]).astype(self.dtype) + if self.upper: + umat = np.triu(y_np) + else: + umat = np.tril(y_np) + z_np = cholesky_solution(umat, x_np, upper=self.upper) + z2_np = scipy_cholesky_solution(umat, x_np, upper=self.upper) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"x": x_np, + "y": umat}, + fetch_list=[z]) + self.assertTrue(np.allclose(fetches[0], z_np)) + + def test_static(self): + for place in self.place: + self.check_static_result(place=place) + + def test_dygraph(self): + def run(place): + paddle.disable_static(place) + x_np = np.random.random([20, 2]).astype(self.dtype) + y_np = np.random.random([20, 20]).astype(self.dtype) + z_np = scipy_cholesky_solution(y_np, x_np, upper=self.upper) + + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + z = paddle.linalg.cholesky_solve(x, y, upper=self.upper) + + self.assertTrue(np.allclose(z_np, z.numpy())) + self.assertEqual(z_np.shape, z.numpy().shape) + paddle.enable_static() + + for idx, place in 
enumerate(self.place): + run(place) + + def test_boardcast(self): + def run(place): + paddle.disable_static() + x_np = np.random.random([1, 30, 2]).astype(self.dtype) + y_np = np.random.random([2, 30, 30]).astype(self.dtype) + nx_np = np.concatenate((x_np, x_np), axis=0) + + z_sci = scipy_cholesky_solution_batch(y_np, nx_np, upper=self.upper) + + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + z = paddle.linalg.cholesky_solve(x, y, upper=self.upper) + self.assertEqual(z_sci.shape, z.numpy().shape) + self.assertTrue(np.allclose(z_sci, z.numpy())) + + for idx, place in enumerate(self.place): + run(place) + + +class TestCholeskySolveOpError(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + # The input type of solve_op must be Variable. + x1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + y1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x1, y1) + + # The data type of input must be float32 or float64. + x2 = fluid.data(name="x2", shape=[30, 30], dtype="bool") + y2 = fluid.data(name="y2", shape=[30, 10], dtype="bool") + self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x2, y2) + + x3 = fluid.data(name="x3", shape=[30, 30], dtype="int32") + y3 = fluid.data(name="y3", shape=[30, 10], dtype="int32") + self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x3, y3) + + x4 = fluid.data(name="x4", shape=[30, 30], dtype="float16") + y4 = fluid.data(name="y4", shape=[30, 10], dtype="float16") + self.assertRaises(TypeError, paddle.linalg.cholesky_solve, x4, y4) + + # The number of dimensions of input'X must be >= 2. + x5 = fluid.data(name="x5", shape=[30], dtype="float64") + y5 = fluid.data(name="y5", shape=[30, 30], dtype="float64") + self.assertRaises(ValueError, paddle.linalg.cholesky_solve, x5, y5) + + # The number of dimensions of input'Y must be >= 2. + x6 = fluid.data(name="x6", shape=[30, 30], dtype="float64") + y6 = fluid.data(name="y6", shape=[30], dtype="float64") + self.assertRaises(ValueError, paddle.linalg.cholesky_solve, x6, y6) + + # The inner-most 2 dimensions of input'X should be equal to each other + x7 = fluid.data(name="x7", shape=[2, 3, 4], dtype="float64") + y7 = fluid.data(name="y7", shape=[2, 4, 3], dtype="float64") + self.assertRaises(ValueError, paddle.linalg.cholesky_solve, x7, y7) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cov.py b/python/paddle/fluid/tests/unittests/test_cov.py new file mode 100644 index 0000000000000..93ecf13bdcbe7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cov.py @@ -0,0 +1,286 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
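The new test_cholesky_solve_op.py above validates paddle.linalg.cholesky_solve against a SciPy-based reference that recovers x from A x = b using the triangular Cholesky factor and two triangular solves. A minimal standalone sketch of that check follows, assuming NumPy/SciPy are installed and the Paddle build ships the new cholesky_solve op; the seed, sizes, and the identity shift for conditioning are illustrative and not taken from the diff:

    # Sketch only: mirrors the cholesky_solution() reference helper above.
    import numpy as np
    import scipy.linalg
    import paddle

    np.random.seed(0)
    u = np.triu(np.random.rand(10, 10)) + np.eye(10)  # upper factor U with a safe diagonal
    b = np.random.rand(10, 2)

    # Reference: solve (U^T U) x = b by a lower solve with U^T, then an upper solve with U.
    x_ref = scipy.linalg.solve_triangular(
        u, scipy.linalg.solve_triangular(u.T, b, lower=True))

    # API exercised by the new test file (right-hand side first, factor second).
    x = paddle.linalg.cholesky_solve(
        paddle.to_tensor(b), paddle.to_tensor(u), upper=True)
    assert np.allclose(x_ref, x.numpy())

The upper=False path in the test is the mirror image: the factor is lower triangular and the implied matrix is A = L L^T.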
+ +import paddle.fluid as fluid +import unittest +import numpy as np +import six +import paddle + + +def numpy_cov(np_arr, rowvar=True, ddof=1, fweights=None, aweights=None): + return np.cov(np_arr, + rowvar=rowvar, + ddof=int(ddof), + fweights=fweights, + aweights=aweights) + + +class Cov_Test(unittest.TestCase): + def setUp(self): + self.shape = [20, 10] + self.weightshape = [10] + + def test_tensor_cov_default(self): + typelist = ['float64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + + for dtype in typelist: + np_arr = np.random.rand(*self.shape).astype(dtype) + tensor = paddle.to_tensor(np_arr, place=p) + cov = paddle.linalg.cov(tensor, + rowvar=True, + ddof=True, + fweights=None, + aweights=None) + np_cov = numpy_cov( + np_arr, rowvar=True, ddof=1, fweights=None, aweights=None) + self.assertTrue(np.allclose(np_cov, cov.numpy())) + + def test_tensor_cov_rowvar(self): + typelist = ['float64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + + for dtype in typelist: + np_arr = np.random.rand(*self.shape).astype(dtype) + tensor = paddle.to_tensor(np_arr, place=p) + cov = paddle.linalg.cov(tensor, + rowvar=False, + ddof=True, + fweights=None, + aweights=None) + np_cov = numpy_cov( + np_arr, rowvar=False, ddof=1, fweights=None, aweights=None) + self.assertTrue(np.allclose(np_cov, cov.numpy())) + + def test_tensor_cov_ddof(self): + typelist = ['float64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + + for dtype in typelist: + np_arr = np.random.rand(*self.shape).astype(dtype) + tensor = paddle.to_tensor(np_arr, place=p) + cov = paddle.linalg.cov(tensor, + rowvar=True, + ddof=False, + fweights=None, + aweights=None) + np_cov = numpy_cov( + np_arr, rowvar=True, ddof=0, fweights=None, aweights=None) + self.assertTrue(np.allclose(np_cov, cov.numpy())) + + def test_tensor_cov_fweights(self): + typelist = ['float64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + + for dtype in typelist: + np_arr = np.random.rand(*self.shape).astype(dtype) + np_fw = np.random.randint( + 10, size=self.weightshape).astype('int32') + tensor = paddle.to_tensor(np_arr, place=p) + fweights = paddle.to_tensor(np_fw, place=p) + cov = paddle.linalg.cov(tensor, + rowvar=True, + ddof=True, + fweights=fweights, + aweights=None) + np_cov = numpy_cov( + np_arr, rowvar=True, ddof=1, fweights=np_fw, aweights=None) + self.assertTrue(np.allclose(np_cov, cov.numpy())) + + def test_tensor_cov_aweights(self): + typelist = ['float64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + + for dtype in typelist: + np_arr = np.random.rand(*self.shape).astype(dtype) + np_aw = np.random.randint( + 10, size=self.weightshape).astype('int32') + tensor = paddle.to_tensor(np_arr, 
place=p) + aweights = paddle.to_tensor(np_aw, place=p) + cov = paddle.linalg.cov(tensor, + rowvar=True, + ddof=True, + fweights=None, + aweights=aweights) + np_cov = numpy_cov( + np_arr, rowvar=True, ddof=1, fweights=None, aweights=np_aw) + self.assertTrue(np.allclose(np_cov, cov.numpy())) + + def test_tensor_cov_weights(self): + typelist = ['float64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + + for dtype in typelist: + np_arr = np.random.rand(*self.shape).astype(dtype) + np_fw = np.random.randint( + 10, size=self.weightshape).astype('int64') + np_aw = np.random.rand(*self.weightshape).astype('float64') + tensor = paddle.to_tensor(np_arr, place=p) + fweights = paddle.to_tensor(np_fw, place=p) + aweights = paddle.to_tensor(np_aw, place=p) + cov = paddle.linalg.cov(tensor, + rowvar=True, + ddof=True, + fweights=fweights, + aweights=aweights) + np_cov = numpy_cov( + np_arr, rowvar=True, ddof=1, fweights=np_fw, aweights=np_aw) + self.assertTrue(np.allclose(np_cov, cov.numpy())) + + +class Cov_Test2(Cov_Test): + def setUp(self): + self.shape = [10] + self.weightshape = [10] + + +# Input(x) only supports N-D (1<=N<=2) tensor +class Cov_Test3(unittest.TestCase): + def setUp(self): + self.shape = [2, 5, 10] + self.fweightshape = [10] + self.aweightshape = [10] + self.fw_s = 1. + self.aw_s = 1. + + def test_errors(self): + def test_err(): + np_arr = np.random.rand(*self.shape).astype('float64') + np_fw = self.fw_s * np.random.rand( + *self.fweightshape).astype('int32') + np_aw = self.aw_s * np.random.rand( + *self.aweightshape).astype('float64') + tensor = paddle.to_tensor(np_arr) + fweights = paddle.to_tensor(np_fw) + aweights = paddle.to_tensor(np_aw) + cov = paddle.linalg.cov(tensor, + rowvar=True, + ddof=True, + fweights=fweights, + aweights=aweights) + + self.assertRaises(ValueError, test_err) + + +# Input(fweights) only supports N-D (N<=1) tensor +class Cov_Test4(Cov_Test3): + def setUp(self): + self.shape = [5, 10] + self.fweightshape = [2, 10] + self.aweightshape = [10] + self.fw_s = 1. + self.aw_s = 1. + + +# The number of elements in Input(fweights) should equal x's dim[1] +class Cov_Test5(Cov_Test3): + def setUp(self): + self.shape = [5, 10] + self.fweightshape = [5] + self.aweightshape = [10] + self.fw_s = 1. + self.aw_s = 1. + + +# The values of Input(fweights) cannot be negative +class Cov_Test6(Cov_Test3): + def setUp(self): + self.shape = [5, 10] + self.fweightshape = [10] + self.aweightshape = [10] + self.fw_s = -1. + self.aw_s = 1. + + +# Input(aweights) only supports N-D (N<=1) tensor +class Cov_Test7(Cov_Test3): + def setUp(self): + self.shape = [5, 10] + self.fweightshape = [10] + self.aweightshape = [2, 10] + self.fw_s = 1. + self.aw_s = 1. + + +# The number of elements in Input(aweights) should equal x's dim[1] +class Cov_Test8(Cov_Test3): + def setUp(self): + self.shape = [5, 10] + self.fweightshape = [10] + self.aweightshape = [5] + self.fw_s = 1. + self.aw_s = 1. + + +# The values of Input(aweights) cannot be negative +class Cov_Test9(Cov_Test3): + def setUp(self): + self.shape = [5, 10] + self.fweightshape = [10] + self.aweightshape = [10] + self.fw_s = 1. + self.aw_s = -1.
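Every Cov_Test case above boils down to the same dygraph comparison: paddle.linalg.cov evaluated against numpy.cov with matching rowvar/ddof/fweights/aweights arguments. A minimal sketch of one such comparison, assuming NumPy and a Paddle build that provides paddle.linalg.cov; the shapes and the weight range are illustrative:

    # Sketch only: the check performed by the test cases above.
    import numpy as np
    import paddle

    x_np = np.random.rand(20, 10).astype('float64')             # rows are variables (rowvar=True)
    fw_np = np.random.randint(1, 10, size=10).astype('int64')   # frequency weight per observation

    cov = paddle.linalg.cov(
        paddle.to_tensor(x_np),
        rowvar=True, ddof=True,
        fweights=paddle.to_tensor(fw_np), aweights=None)
    np_cov = np.cov(x_np, rowvar=True, ddof=1, fweights=fw_np, aweights=None)
    assert np.allclose(np_cov, cov.numpy())

ddof=True on the Paddle side corresponds to ddof=1 in numpy.cov, which is exactly how the numpy_cov helper above maps it.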
+ + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py index c9134867ef2aa..654397b6c201f 100755 --- a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py +++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py @@ -140,7 +140,7 @@ def test_ops_elementwise_mul(self): b = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) x = paddle.to_tensor(a) y = paddle.to_tensor(b) - res = core.ops.elementwise_mul(x, y) + res = _C_ops.elementwise_mul(x, y) # expected expected = LOWEST_WARNING_POSTION diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py index 668b4ad872f43..4b1f0ee85d944 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py @@ -62,10 +62,6 @@ def test_gradient_merge_optimizer(self): self.assertEqual(sends, 0) self.assertEqual(sgds, 0) - fleet.init_worker() - time.sleep(8) - fleet.stop_worker() - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py deleted file mode 100644 index 6cf2c5f6e2ca4..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_distribution.py +++ /dev/null @@ -1,1299 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import unittest -import paddle -from paddle import fluid -from paddle.fluid import layers -from paddle.distribution import * -import math - - -class DistributionNumpy(): - def sample(self): - raise NotImplementedError - - def entropy(self): - raise NotImplementedError - - def kl_divergence(self, other): - raise NotImplementedError - - def log_prob(self, value): - raise NotImplementedError - - def probs(self, value): - raise NotImplementedError - - -class UniformNumpy(DistributionNumpy): - def __init__(self, low, high): - self.low = np.array(low) - self.high = np.array(high) - if str(self.low.dtype) not in ['float32', 'float64']: - self.low = self.low.astype('float32') - self.high = self.high.astype('float32') - - def sample(self, shape): - shape = tuple(shape) + (self.low + self.high).shape - return self.low + (np.random.uniform(size=shape) * - (self.high - self.low)) - - def log_prob(self, value): - lb = np.less(self.low, value).astype(self.low.dtype) - ub = np.less(value, self.high).astype(self.low.dtype) - return np.log(lb * ub) - np.log(self.high - self.low) - - def probs(self, value): - lb = np.less(self.low, value).astype(self.low.dtype) - ub = np.less(value, self.high).astype(self.low.dtype) - return (lb * ub) / (self.high - self.low) - - def entropy(self): - return np.log(self.high - self.low) - - -class UniformTest(unittest.TestCase): - def setUp(self, use_gpu=False, batch_size=5, dims=6): - self.use_gpu = use_gpu - if not use_gpu: - self.place = fluid.CPUPlace() - self.gpu_id = -1 - else: - self.place = fluid.CUDAPlace(0) - self.gpu_id = 0 - - self.init_numpy_data(batch_size, dims) - - paddle.disable_static(self.place) - self.init_dynamic_data(batch_size, dims) - - paddle.enable_static() - self.test_program = fluid.Program() - self.executor = fluid.Executor(self.place) - self.init_static_data(batch_size, dims) - - def init_numpy_data(self, batch_size, dims): - # low ans high are 'float' - self.low_np = np.random.uniform(-2, 1) - self.high_np = np.random.uniform(2, 4) - self.values_np = np.array([1.0]).astype('float32') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_low = self.low_np - self.dynamic_high = self.high_np - self.dynamic_values = paddle.to_tensor(self.values_np) - - def init_static_data(self, batch_size, dims): - self.static_low = self.low_np - self.static_high = self.high_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[], dtype='float32') - - def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): - sample, entropy, log_prob, probs = fetch_list - - np_uniform = UniformNumpy(self.low_np, self.high_np) - np_sample = np_uniform.sample([sample_shape]) - np_entropy = np_uniform.entropy() - np_lp = np_uniform.log_prob(self.values_np) - np_p = np_uniform.probs(self.values_np) - - np.testing.assert_equal(sample.shape, np_sample.shape) - np.testing.assert_allclose( - entropy, np_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - log_prob, np_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose(probs, np_p, rtol=tolerance, atol=tolerance) - - def test_uniform_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): - paddle.disable_static(self.place) - uniform = Uniform(self.dynamic_low, self.dynamic_high) - sample = uniform.sample([sample_shape]).numpy() - entropy = uniform.entropy().numpy() - log_prob = uniform.log_prob(self.dynamic_values).numpy() - probs = uniform.probs(self.dynamic_values).numpy() - fetch_list = 
[sample, entropy, log_prob, probs] - - self.compare_with_numpy(fetch_list) - - def test_uniform_distribution_static(self, sample_shape=7, tolerance=1e-6): - paddle.enable_static() - with fluid.program_guard(self.test_program): - uniform = Uniform(self.static_low, self.static_high) - sample = uniform.sample([sample_shape]) - entropy = uniform.entropy() - log_prob = uniform.log_prob(self.static_values) - probs = uniform.probs(self.static_values) - fetch_list = [sample, entropy, log_prob, probs] - - feed_vars = { - 'low': self.low_np, - 'high': self.high_np, - 'values': self.values_np - } - - self.executor.run(fluid.default_startup_program()) - fetch_list = self.executor.run(program=self.test_program, - feed=feed_vars, - fetch_list=fetch_list) - - self.compare_with_numpy(fetch_list) - - -class UniformTest2(UniformTest): - def init_numpy_data(self, batch_size, dims): - # low ans high are 'int' - self.low_np = int(np.random.uniform(-2, 1)) - self.high_np = int(np.random.uniform(2, 4)) - self.values_np = np.array([1.0]).astype('float32') - - -class UniformTest3(UniformTest): - def init_numpy_data(self, batch_size, dims): - # test broadcast: low is float, high is numpy.ndarray with dtype 'float32'. - self.low_np = np.random.uniform(-2, 1) - self.high_np = np.random.uniform(5.0, 15.0, - (batch_size, dims)).astype('float32') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - - def init_static_data(self, batch_size, dims): - self.static_low = self.low_np - self.static_high = self.high_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class UniformTest4(UniformTest): - def init_numpy_data(self, batch_size, dims): - # low and high are numpy.ndarray with dtype 'float32'. - self.low_np = np.random.randn(batch_size, dims).astype('float32') - self.high_np = np.random.uniform(5.0, 15.0, - (batch_size, dims)).astype('float32') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - - def init_static_data(self, batch_size, dims): - self.static_low = self.low_np - self.static_high = self.high_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class UniformTest5(UniformTest): - def init_numpy_data(self, batch_size, dims): - # low and high are numpy.ndarray with dtype 'float64'. - self.low_np = np.random.randn(batch_size, dims).astype('float64') - self.high_np = np.random.uniform(5.0, 15.0, - (batch_size, dims)).astype('float64') - self.values_np = np.random.randn(batch_size, dims).astype('float64') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_low = self.low_np - self.dynamic_high = self.high_np - self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') - - def init_static_data(self, batch_size, dims): - self.static_low = self.low_np - self.static_high = self.high_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float64') - - -class UniformTest6(UniformTest): - def init_numpy_data(self, batch_size, dims): - # low and high are Tensor with dtype 'VarType.FP32'. 
- self.low_np = np.random.randn(batch_size, dims).astype('float32') - self.high_np = np.random.uniform(5.0, 15.0, - (batch_size, dims)).astype('float32') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_low = paddle.to_tensor(self.low_np) - self.dynamic_high = paddle.to_tensor(self.high_np) - self.dynamic_values = paddle.to_tensor(self.values_np) - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.static_low = layers.data( - name='low', shape=[dims], dtype='float32') - self.static_high = layers.data( - name='high', shape=[dims], dtype='float32') - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class UniformTest7(UniformTest): - def init_numpy_data(self, batch_size, dims): - # low and high are Tensor with dtype 'VarType.FP64'. - self.low_np = np.random.randn(batch_size, dims).astype('float64') - self.high_np = np.random.uniform(5.0, 15.0, - (batch_size, dims)).astype('float64') - self.values_np = np.random.randn(batch_size, dims).astype('float64') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') - self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') - self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.static_low = layers.data( - name='low', shape=[dims], dtype='float64') - self.static_high = layers.data( - name='high', shape=[dims], dtype='float64') - self.static_values = layers.data( - name='values', shape=[dims], dtype='float64') - - -class UniformTest8(UniformTest): - def init_numpy_data(self, batch_size, dims): - # low and high are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'. - self.low_np = np.random.randn(batch_size, dims).astype('float64') - self.high_np = np.random.uniform(5.0, 15.0, - (batch_size, dims)).astype('float64') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') - self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') - self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float32') - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.static_low = layers.data( - name='low', shape=[dims], dtype='float64') - self.static_high = layers.data( - name='high', shape=[dims], dtype='float64') - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class UniformTest9(UniformTest): - def init_numpy_data(self, batch_size, dims): - # low and high are numpy.ndarray with dtype 'float32'. - # high < low. - self.low_np = np.random.randn(batch_size, dims).astype('float32') - self.high_np = np.random.uniform(-10.0, -5.0, - (batch_size, dims)).astype('float32') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - - def init_static_data(self, batch_size, dims): - self.static_low = self.low_np - self.static_high = self.high_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class UniformTest10(UniformTest): - def init_numpy_data(self, batch_size, dims): - # low and high are list. 
- self.low_np = np.random.randn(batch_size, - dims).astype('float32').tolist() - self.high_np = np.random.uniform( - 5.0, 15.0, (batch_size, dims)).astype('float32').tolist() - self.values_np = np.random.randn(batch_size, dims).astype('float32') - - def init_static_data(self, batch_size, dims): - self.static_low = self.low_np - self.static_high = self.high_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class UniformTest11(UniformTest): - def init_numpy_data(self, batch_size, dims): - # low and high are tuple. - self.low_np = tuple( - np.random.randn(batch_size, dims).astype('float32').tolist()) - self.high_np = tuple( - np.random.uniform(5.0, 15.0, (batch_size, dims)).astype('float32') - .tolist()) - self.values_np = np.random.randn(batch_size, dims).astype('float32') - - def init_static_data(self, batch_size, dims): - self.static_low = self.low_np - self.static_high = self.high_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class UniformTestSample(unittest.TestCase): - def setUp(self): - self.init_param() - - def init_param(self): - self.low = 3.0 - self.high = 4.0 - - def test_uniform_sample(self): - paddle.disable_static() - uniform = Uniform(low=self.low, high=self.high) - s = uniform.sample([100]) - self.assertTrue((s >= self.low).all()) - self.assertTrue((s < self.high).all()) - paddle.enable_static() - - -class UniformTestSample2(UniformTestSample): - def init_param(self): - self.low = -5.0 - self.high = 2.0 - - -class NormalNumpy(DistributionNumpy): - def __init__(self, loc, scale): - self.loc = np.array(loc) - self.scale = np.array(scale) - if str(self.loc.dtype) not in ['float32', 'float64']: - self.loc = self.loc.astype('float32') - self.scale = self.scale.astype('float32') - - def sample(self, shape): - shape = tuple(shape) + (self.loc + self.scale).shape - return self.loc + (np.random.randn(*shape) * self.scale) - - def log_prob(self, value): - var = self.scale * self.scale - log_scale = np.log(self.scale) - return -((value - self.loc) * (value - self.loc)) / ( - 2. * var) - log_scale - math.log(math.sqrt(2. * math.pi)) - - def probs(self, value): - var = self.scale * self.scale - return np.exp(-1. * ((value - self.loc) * (value - self.loc)) / - (2. * var)) / (math.sqrt(2 * math.pi) * self.scale) - - def entropy(self): - return 0.5 + 0.5 * np.log( - np.array(2. 
* math.pi).astype(self.loc.dtype)) + np.log(self.scale) - - def kl_divergence(self, other): - var_ratio = (self.scale / other.scale) - var_ratio = var_ratio * var_ratio - t1 = ((self.loc - other.loc) / other.scale) - t1 = (t1 * t1) - return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) - - -class NormalTest(unittest.TestCase): - def setUp(self, use_gpu=False, batch_size=2, dims=3): - self.use_gpu = use_gpu - if not use_gpu: - self.place = fluid.CPUPlace() - self.gpu_id = -1 - else: - self.place = fluid.CUDAPlace(0) - self.gpu_id = 0 - - self.init_numpy_data(batch_size, dims) - - paddle.disable_static(self.place) - self.init_dynamic_data(batch_size, dims) - - paddle.enable_static() - self.test_program = fluid.Program() - self.executor = fluid.Executor(self.place) - self.init_static_data(batch_size, dims) - - def init_numpy_data(self, batch_size, dims): - # loc ans scale are 'float' - self.loc_np = (np.random.ranf() - 0.5) * 4 - self.scale_np = (np.random.ranf() - 0.5) * 4 - while self.scale_np < 0: - self.scale_np = (np.random.ranf() - 0.5) * 4 - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = (np.random.ranf() - 0.5) * 4 - self.other_scale_np = (np.random.ranf() - 0.5) * 4 - while self.other_scale_np < 0: - self.other_scale_np = (np.random.ranf() - 0.5) * 4 - self.values_np = np.random.ranf(1).astype('float32') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_loc = self.loc_np - self.dynamic_scale = self.scale_np - self.dynamic_other_loc = self.other_loc_np - self.dynamic_other_scale = self.other_scale_np - self.dynamic_values = paddle.to_tensor(self.values_np) - - def init_static_data(self, batch_size, dims): - self.static_loc = self.loc_np - self.static_scale = self.scale_np - self.static_other_loc = self.other_loc_np - self.static_other_scale = self.other_scale_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[], dtype='float32') - - def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): - sample, entropy, log_prob, probs, kl = fetch_list - - np_normal = NormalNumpy(self.loc_np, self.scale_np) - np_sample = np_normal.sample([sample_shape]) - np_entropy = np_normal.entropy() - np_lp = np_normal.log_prob(self.values_np) - np_p = np_normal.probs(self.values_np) - np_other_normal = NormalNumpy(self.other_loc_np, self.other_scale_np) - np_kl = np_normal.kl_divergence(np_other_normal) - - # Because assign op does not support the input of numpy.ndarray whose dtype is FP64. - # When loc and scale are FP64 numpy.ndarray, we need to use assign op to convert it - # to FP32 Tensor. And then use cast op to convert it to a FP64 Tensor. - # There is a loss of accuracy in this conversion. - # So set the tolerance from 1e-6 to 1e-4. 
- log_tolerance = 1e-4 - - np.testing.assert_equal(sample.shape, np_sample.shape) - np.testing.assert_allclose( - entropy, np_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - log_prob, np_lp, rtol=log_tolerance, atol=log_tolerance) - np.testing.assert_allclose( - probs, np_p, rtol=log_tolerance, atol=log_tolerance) - np.testing.assert_allclose( - kl, np_kl, rtol=log_tolerance, atol=log_tolerance) - - def test_normal_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): - paddle.disable_static(self.place) - normal = Normal(self.dynamic_loc, self.dynamic_scale) - - sample = normal.sample([sample_shape]).numpy() - entropy = normal.entropy().numpy() - log_prob = normal.log_prob(self.dynamic_values).numpy() - probs = normal.probs(self.dynamic_values).numpy() - other_normal = Normal(self.dynamic_other_loc, self.dynamic_other_scale) - kl = normal.kl_divergence(other_normal).numpy() - - fetch_list = [sample, entropy, log_prob, probs, kl] - self.compare_with_numpy(fetch_list) - - def test_normal_distribution_static(self, sample_shape=7, tolerance=1e-6): - paddle.enable_static() - with fluid.program_guard(self.test_program): - normal = Normal(self.static_loc, self.static_scale) - - sample = normal.sample([sample_shape]) - entropy = normal.entropy() - log_prob = normal.log_prob(self.static_values) - probs = normal.probs(self.static_values) - other_normal = Normal(self.static_other_loc, - self.static_other_scale) - kl = normal.kl_divergence(other_normal) - - fetch_list = [sample, entropy, log_prob, probs, kl] - - feed_vars = { - 'loc': self.loc_np, - 'scale': self.scale_np, - 'values': self.values_np, - 'other_loc': self.other_loc_np, - 'other_scale': self.other_scale_np - } - - self.executor.run(fluid.default_startup_program()) - fetch_list = self.executor.run(program=self.test_program, - feed=feed_vars, - fetch_list=fetch_list) - - self.compare_with_numpy(fetch_list) - - -class NormalTest2(NormalTest): - def init_numpy_data(self, batch_size, dims): - # loc ans scale are 'int' - self.loc_np = int((np.random.ranf() - 0.5) * 8) - self.scale_np = int((np.random.ranf() - 0.5) * 8) - while self.scale_np < 0: - self.scale_np = int((np.random.ranf() - 0.5) * 8) - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = int((np.random.ranf() - 0.5) * 8) - self.other_scale_np = int((np.random.ranf() - 0.5) * 8) - while self.other_scale_np < 0: - self.other_scale_np = int((np.random.ranf() - 0.5) * 8) - self.values_np = np.random.ranf(1).astype('float32') - - -class NormalTest3(NormalTest): - def init_numpy_data(self, batch_size, dims): - # test broadcast: loc is float, scale is numpy.ndarray with dtype 'float32'. 
- self.loc_np = (np.random.ranf() - 0.5) * 4 - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - while not np.all(self.scale_np > 0): - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = (np.random.ranf() - 0.5) * 4 - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - while not np.all(self.scale_np > 0): - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - - def init_static_data(self, batch_size, dims): - self.static_loc = self.loc_np - self.static_scale = self.scale_np - self.static_other_loc = self.other_loc_np - self.static_other_scale = self.other_scale_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class NormalTest4(NormalTest): - def init_numpy_data(self, batch_size, dims): - # loc and scale are numpy.ndarray with dtype 'float32'. - self.loc_np = np.random.randn(batch_size, dims).astype('float32') - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - while not np.all(self.scale_np > 0): - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - while not np.all(self.scale_np > 0): - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - - def init_static_data(self, batch_size, dims): - self.static_loc = self.loc_np - self.static_scale = self.scale_np - self.static_other_loc = self.other_loc_np - self.static_other_scale = self.other_scale_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class NormalTest5(NormalTest): - def init_numpy_data(self, batch_size, dims): - # loc and scale are numpy.ndarray with dtype 'float64'. 
- self.loc_np = np.random.randn(batch_size, dims).astype('float64') - self.scale_np = np.random.randn(batch_size, dims).astype('float64') - while not np.all(self.scale_np > 0): - self.scale_np = np.random.randn(batch_size, dims).astype('float64') - self.values_np = np.random.randn(batch_size, dims).astype('float64') - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float64') - while not np.all(self.scale_np > 0): - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float64') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_loc = self.loc_np - self.dynamic_scale = self.scale_np - self.dynamic_other_loc = self.other_loc_np - self.dynamic_other_scale = self.other_scale_np - self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') - - def init_static_data(self, batch_size, dims): - self.static_loc = self.loc_np - self.static_scale = self.scale_np - self.static_other_loc = self.other_loc_np - self.static_other_scale = self.other_scale_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float64') - - -class NormalTest6(NormalTest): - def init_numpy_data(self, batch_size, dims): - # loc and scale are Tensor with dtype 'VarType.FP32'. - self.loc_np = np.random.randn(batch_size, dims).astype('float32') - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - while not np.all(self.scale_np > 0): - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - while not np.all(self.scale_np > 0): - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_loc = paddle.to_tensor(self.loc_np) - self.dynamic_scale = paddle.to_tensor(self.scale_np) - self.dynamic_values = paddle.to_tensor(self.values_np) - self.dynamic_other_loc = paddle.to_tensor(self.other_loc_np) - self.dynamic_other_scale = paddle.to_tensor(self.other_scale_np) - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.static_loc = layers.data( - name='loc', shape=[dims], dtype='float32') - self.static_scale = layers.data( - name='scale', shape=[dims], dtype='float32') - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - self.static_other_loc = layers.data( - name='other_loc', shape=[dims], dtype='float32') - self.static_other_scale = layers.data( - name='other_scale', shape=[dims], dtype='float32') - - -class NormalTest7(NormalTest): - def init_numpy_data(self, batch_size, dims): - # loc and scale are Tensor with dtype 'VarType.FP64'. 
- self.loc_np = np.random.randn(batch_size, dims).astype('float64') - self.scale_np = np.random.randn(batch_size, dims).astype('float64') - while not np.all(self.scale_np > 0): - self.scale_np = np.random.randn(batch_size, dims).astype('float64') - self.values_np = np.random.randn(batch_size, dims).astype('float64') - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float64') - while not np.all(self.scale_np > 0): - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float64') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') - self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') - self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') - self.dynamic_other_loc = paddle.to_tensor( - self.other_loc_np, dtype='float64') - self.dynamic_other_scale = paddle.to_tensor( - self.other_scale_np, dtype='float64') - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.static_loc = layers.data( - name='loc', shape=[dims], dtype='float64') - self.static_scale = layers.data( - name='scale', shape=[dims], dtype='float64') - self.static_values = layers.data( - name='values', shape=[dims], dtype='float64') - self.static_other_loc = layers.data( - name='other_loc', shape=[dims], dtype='float64') - self.static_other_scale = layers.data( - name='other_scale', shape=[dims], dtype='float64') - - -class NormalTest8(NormalTest): - def init_numpy_data(self, batch_size, dims): - # loc and scale are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'. - self.loc_np = np.random.randn(batch_size, dims).astype('float64') - self.scale_np = np.random.randn(batch_size, dims).astype('float64') - while not np.all(self.scale_np > 0): - self.scale_np = np.random.randn(batch_size, dims).astype('float64') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float64') - while not np.all(self.scale_np > 0): - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float64') - - def init_dynamic_data(self, batch_size, dims): - self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') - self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') - self.dynamic_values = paddle.to_tensor(self.values_np) - self.dynamic_other_loc = paddle.to_tensor( - self.other_loc_np, dtype='float64') - self.dynamic_other_scale = paddle.to_tensor( - self.other_scale_np, dtype='float64') - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.static_loc = layers.data( - name='loc', shape=[dims], dtype='float64') - self.static_scale = layers.data( - name='scale', shape=[dims], dtype='float64') - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - self.static_other_loc = layers.data( - name='other_loc', shape=[dims], dtype='float64') - self.static_other_scale = layers.data( - name='other_scale', shape=[dims], dtype='float64') - - -class NormalTest9(NormalTest): - def init_numpy_data(self, batch_size, dims): - # loc and scale are list. 
- self.loc_np = np.random.randn(batch_size, - dims).astype('float32').tolist() - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - while not np.all(self.scale_np > 0): - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - self.scale_np = self.scale_np.tolist() - self.values_np = np.random.randn(batch_size, dims).astype('float32') - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = np.random.randn(batch_size, - dims).astype('float32').tolist() - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - while not np.all(self.other_scale_np > 0): - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - self.other_scale_np = self.other_scale_np.tolist() - - def init_static_data(self, batch_size, dims): - self.static_loc = self.loc_np - self.static_scale = self.scale_np - self.static_other_loc = self.other_loc_np - self.static_other_scale = self.other_scale_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class NormalTest10(NormalTest): - def init_numpy_data(self, batch_size, dims): - # loc and scale are tuple. - self.loc_np = tuple( - np.random.randn(batch_size, dims).astype('float32').tolist()) - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - while not np.all(self.scale_np > 0): - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - self.scale_np = tuple(self.scale_np.tolist()) - self.values_np = np.random.randn(batch_size, dims).astype('float32') - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = tuple( - np.random.randn(batch_size, dims).astype('float32').tolist()) - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - while not np.all(self.other_scale_np > 0): - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - self.other_scale_np = tuple(self.other_scale_np.tolist()) - - def init_static_data(self, batch_size, dims): - self.static_loc = self.loc_np - self.static_scale = self.scale_np - self.static_other_loc = self.other_loc_np - self.static_other_scale = self.other_scale_np - with fluid.program_guard(self.test_program): - self.static_values = layers.data( - name='values', shape=[dims], dtype='float32') - - -class CategoricalNumpy(DistributionNumpy): - def __init__(self, logits): - self.logits = np.array(logits).astype('float32') - - def entropy(self): - logits = self.logits - np.max(self.logits, axis=-1, keepdims=True) - e_logits = np.exp(logits) - z = np.sum(e_logits, axis=-1, keepdims=True) - prob = e_logits / z - return -1. 
* np.sum(prob * (logits - np.log(z)), axis=-1, keepdims=True) - - def kl_divergence(self, other): - logits = self.logits - np.max(self.logits, axis=-1, keepdims=True) - other_logits = other.logits - np.max( - other.logits, axis=-1, keepdims=True) - e_logits = np.exp(logits) - other_e_logits = np.exp(other_logits) - z = np.sum(e_logits, axis=-1, keepdims=True) - other_z = np.sum(other_e_logits, axis=-1, keepdims=True) - prob = e_logits / z - return np.sum(prob * (logits - np.log(z) - other_logits \ - + np.log(other_z)), axis=-1, keepdims=True) - - -class CategoricalTest(unittest.TestCase): - def setUp(self, use_gpu=False, batch_size=3, dims=5): - self.use_gpu = use_gpu - if not use_gpu: - self.place = fluid.CPUPlace() - self.gpu_id = -1 - else: - self.place = fluid.CUDAPlace(0) - self.gpu_id = 0 - - self.batch_size = batch_size - self.dims = dims - self.init_numpy_data(batch_size, dims) - - paddle.disable_static(self.place) - self.init_dynamic_data(batch_size, dims) - - paddle.enable_static() - self.test_program = fluid.Program() - self.executor = fluid.Executor(self.place) - self.init_static_data(batch_size, dims) - - def init_numpy_data(self, batch_size, dims): - # input logtis is 2-D Tensor - # value used in probs and log_prob method is 1-D Tensor - self.logits_np = np.random.rand(batch_size, dims).astype('float32') - self.other_logits_np = np.random.rand(batch_size, - dims).astype('float32') - self.value_np = np.array([2, 1, 3]).astype('int64') - - self.logits_shape = [batch_size, dims] - # dist_shape = logits_shape[:-1], it represents the number of - # different distributions. - self.dist_shape = [batch_size] - # sample shape represents the number of samples - self.sample_shape = [2, 4] - # value used in probs and log_prob method - # If value is 1-D and logits is 2-D or higher dimension, value will be - # broadcasted to have the same number of distributions with logits. - # If value is 2-D or higher dimentsion, it should have the same number - # of distributions with logtis. 
``value[:-1] = logits[:-1] - self.value_shape = [3] - - def init_dynamic_data(self, batch_size, dims): - self.logits = paddle.to_tensor(self.logits_np) - self.other_logits = paddle.to_tensor(self.other_logits_np) - self.value = paddle.to_tensor(self.value_np) - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.logits_static = fluid.data( - name='logits', shape=self.logits_shape, dtype='float32') - self.other_logits_static = fluid.data( - name='other_logits', shape=self.logits_shape, dtype='float32') - self.value_static = fluid.data( - name='value', shape=self.value_shape, dtype='int64') - - def get_numpy_selected_probs(self, probability): - np_probs = np.zeros(self.dist_shape + self.value_shape) - for i in range(self.batch_size): - for j in range(3): - np_probs[i][j] = probability[i][self.value_np[j]] - return np_probs - - def compare_with_numpy(self, fetch_list, tolerance=1e-6): - sample, entropy, kl, probs, log_prob = fetch_list - log_tolerance = 1e-4 - - np.testing.assert_equal(sample.shape, - self.sample_shape + self.dist_shape) - - np_categorical = CategoricalNumpy(self.logits_np) - np_other_categorical = CategoricalNumpy(self.other_logits_np) - np_entropy = np_categorical.entropy() - np_kl = np_categorical.kl_divergence(np_other_categorical) - - np.testing.assert_allclose( - entropy, np_entropy, rtol=log_tolerance, atol=log_tolerance) - np.testing.assert_allclose( - kl, np_kl, rtol=log_tolerance, atol=log_tolerance) - - sum_dist = np.sum(self.logits_np, axis=-1, keepdims=True) - probability = self.logits_np / sum_dist - np_probs = self.get_numpy_selected_probs(probability) - np_log_prob = np.log(np_probs) - - np.testing.assert_allclose( - probs, np_probs, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - log_prob, np_log_prob, rtol=tolerance, atol=tolerance) - - def test_categorical_distribution_dygraph(self, tolerance=1e-6): - paddle.disable_static(self.place) - categorical = Categorical(self.logits) - other_categorical = Categorical(self.other_logits) - - sample = categorical.sample(self.sample_shape).numpy() - entropy = categorical.entropy().numpy() - kl = categorical.kl_divergence(other_categorical).numpy() - probs = categorical.probs(self.value).numpy() - log_prob = categorical.log_prob(self.value).numpy() - - fetch_list = [sample, entropy, kl, probs, log_prob] - self.compare_with_numpy(fetch_list) - - def test_categorical_distribution_static(self, tolerance=1e-6): - paddle.enable_static() - with fluid.program_guard(self.test_program): - categorical = Categorical(self.logits_static) - other_categorical = Categorical(self.other_logits_static) - - sample = categorical.sample(self.sample_shape) - entropy = categorical.entropy() - kl = categorical.kl_divergence(other_categorical) - probs = categorical.probs(self.value_static) - log_prob = categorical.log_prob(self.value_static) - - fetch_list = [sample, entropy, kl, probs, log_prob] - - feed_vars = { - 'logits': self.logits_np, - 'other_logits': self.other_logits_np, - 'value': self.value_np - } - - self.executor.run(fluid.default_startup_program()) - fetch_list = self.executor.run(program=self.test_program, - feed=feed_vars, - fetch_list=fetch_list) - - self.compare_with_numpy(fetch_list) - - -class CategoricalTest2(CategoricalTest): - def init_numpy_data(self, batch_size, dims): - # input logtis is 2-D Tensor with dtype Float64 - # value used in probs and log_prob method is 1-D Tensor - self.logits_np = np.random.rand(batch_size, dims).astype('float64') - 
self.other_logits_np = np.random.rand(batch_size, - dims).astype('float64') - self.value_np = np.array([2, 1, 3]).astype('int64') - - self.logits_shape = [batch_size, dims] - self.dist_shape = [batch_size] - self.sample_shape = [2, 4] - self.value_shape = [3] - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.logits_static = fluid.data( - name='logits', shape=self.logits_shape, dtype='float64') - self.other_logits_static = fluid.data( - name='other_logits', shape=self.logits_shape, dtype='float64') - self.value_static = fluid.data( - name='value', shape=self.value_shape, dtype='int64') - - -class CategoricalTest3(CategoricalTest): - def init_dynamic_data(self, batch_size, dims): - # input logtis is 2-D numpy.ndarray with dtype Float32 - # value used in probs and log_prob method is 1-D Tensor - self.logits = self.logits_np - self.other_logits = self.other_logits_np - self.value = paddle.to_tensor(self.value_np) - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.logits_static = self.logits_np - self.other_logits_static = self.other_logits_np - self.value_static = fluid.data( - name='value', shape=self.value_shape, dtype='int64') - - -class CategoricalTest4(CategoricalTest): - def init_numpy_data(self, batch_size, dims): - # input logtis is 2-D numpy.ndarray with dtype Float64 - # value used in probs and log_prob method is 1-D Tensor - self.logits_np = np.random.rand(batch_size, dims).astype('float64') - self.other_logits_np = np.random.rand(batch_size, - dims).astype('float64') - self.value_np = np.array([2, 1, 3]).astype('int64') - - self.logits_shape = [batch_size, dims] - self.dist_shape = [batch_size] - self.sample_shape = [2, 4] - self.value_shape = [3] - - def init_dynamic_data(self, batch_size, dims): - self.logits = self.logits_np - self.other_logits = self.other_logits_np - self.value = paddle.to_tensor(self.value_np) - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.logits_static = self.logits_np - self.other_logits_static = self.other_logits_np - self.value_static = fluid.data( - name='value', shape=self.value_shape, dtype='int64') - - -# test shape of logits and value used in probs and log_prob method -class CategoricalTest5(CategoricalTest): - def init_numpy_data(self, batch_size, dims): - # input logtis is 1-D Tensor - # value used in probs and log_prob method is 1-D Tensor - self.logits_np = np.random.rand(dims).astype('float32') - self.other_logits_np = np.random.rand(dims).astype('float32') - self.value_np = np.array([2, 1, 3]).astype('int64') - - self.logits_shape = [dims] - self.dist_shape = [] - self.sample_shape = [2, 4] - self.value_shape = [3] - - def get_numpy_selected_probs(self, probability): - np_probs = np.zeros(self.value_shape) - for i in range(3): - np_probs[i] = probability[self.value_np[i]] - return np_probs - - -class CategoricalTest6(CategoricalTest): - def init_numpy_data(self, batch_size, dims): - # input logtis is 2-D Tensor - # value used in probs and log_prob method has the same number of batches with input - self.logits_np = np.random.rand(3, 5).astype('float32') - self.other_logits_np = np.random.rand(3, 5).astype('float32') - self.value_np = np.array([[2, 1], [0, 3], [2, 3]]).astype('int64') - - self.logits_shape = [3, 5] - self.dist_shape = [3] - self.sample_shape = [2, 4] - self.value_shape = [3, 2] - - def get_numpy_selected_probs(self, probability): - np_probs = np.zeros(self.value_shape) 
- for i in range(3): - for j in range(2): - np_probs[i][j] = probability[i][self.value_np[i][j]] - return np_probs - - -class CategoricalTest7(CategoricalTest): - def init_numpy_data(self, batch_size, dims): - # input logtis is 3-D Tensor - # value used in probs and log_prob method has the same number of distribuions with input - self.logits_np = np.random.rand(3, 2, 5).astype('float32') - self.other_logits_np = np.random.rand(3, 2, 5).astype('float32') - self.value_np = np.array([2, 1, 3]).astype('int64') - - self.logits_shape = [3, 2, 5] - self.dist_shape = [3, 2] - self.sample_shape = [2, 4] - self.value_shape = [3] - - def get_numpy_selected_probs(self, probability): - np_probs = np.zeros(self.dist_shape + self.value_shape) - for i in range(3): - for j in range(2): - for k in range(3): - np_probs[i][j][k] = probability[i][j][self.value_np[k]] - return np_probs - - -class CategoricalTest8(CategoricalTest): - def init_dynamic_data(self, batch_size, dims): - # input logtis is 2-D list - # value used in probs and log_prob method is 1-D Tensor - self.logits = self.logits_np.tolist() - self.other_logits = self.other_logits_np.tolist() - self.value = paddle.to_tensor(self.value_np) - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.logits_static = self.logits_np.tolist() - self.other_logits_static = self.other_logits_np.tolist() - self.value_static = fluid.data( - name='value', shape=self.value_shape, dtype='int64') - - -class CategoricalTest9(CategoricalTest): - def init_dynamic_data(self, batch_size, dims): - # input logtis is 2-D tuple - # value used in probs and log_prob method is 1-D Tensor - self.logits = tuple(self.logits_np.tolist()) - self.other_logits = tuple(self.other_logits_np.tolist()) - self.value = paddle.to_tensor(self.value_np) - - def init_static_data(self, batch_size, dims): - with fluid.program_guard(self.test_program): - self.logits_static = tuple(self.logits_np.tolist()) - self.other_logits_static = tuple(self.other_logits_np.tolist()) - self.value_static = fluid.data( - name='value', shape=self.value_shape, dtype='int64') - - -class DistributionTestError(unittest.TestCase): - def test_distribution_error(self): - distribution = Distribution() - - self.assertRaises(NotImplementedError, distribution.sample) - self.assertRaises(NotImplementedError, distribution.entropy) - - normal = Normal(0.0, 1.0) - self.assertRaises(NotImplementedError, distribution.kl_divergence, - normal) - - value_npdata = np.array([0.8], dtype="float32") - value_tensor = layers.create_tensor(dtype="float32") - self.assertRaises(NotImplementedError, distribution.log_prob, - value_tensor) - self.assertRaises(NotImplementedError, distribution.probs, value_tensor) - - def test_normal_error(self): - paddle.enable_static() - normal = Normal(0.0, 1.0) - - value = [1.0, 2.0] - # type of value must be variable - self.assertRaises(TypeError, normal.log_prob, value) - - value = [1.0, 2.0] - # type of value must be variable - self.assertRaises(TypeError, normal.probs, value) - - shape = 1.0 - # type of shape must be list - self.assertRaises(TypeError, normal.sample, shape) - - seed = 1.0 - # type of seed must be int - self.assertRaises(TypeError, normal.sample, [2, 3], seed) - - normal_other = Uniform(1.0, 2.0) - # type of other must be an instance of Normal - self.assertRaises(TypeError, normal.kl_divergence, normal_other) - - def test_uniform_error(self): - paddle.enable_static() - uniform = Uniform(0.0, 1.0) - - value = [1.0, 2.0] - # type of value must be 
variable - self.assertRaises(TypeError, uniform.log_prob, value) - - value = [1.0, 2.0] - # type of value must be variable - self.assertRaises(TypeError, uniform.probs, value) - - shape = 1.0 - # type of shape must be list - self.assertRaises(TypeError, uniform.sample, shape) - - seed = 1.0 - # type of seed must be int - self.assertRaises(TypeError, uniform.sample, [2, 3], seed) - - def test_categorical_error(self): - paddle.enable_static() - - categorical = Categorical([0.4, 0.6]) - - value = [1, 0] - # type of value must be variable - self.assertRaises(AttributeError, categorical.log_prob, value) - - value = [1, 0] - # type of value must be variable - self.assertRaises(AttributeError, categorical.probs, value) - - shape = 1.0 - # type of shape must be list - self.assertRaises(TypeError, categorical.sample, shape) - - categorical_other = Uniform(1.0, 2.0) - # type of other must be an instance of Categorical - self.assertRaises(TypeError, categorical.kl_divergence, - categorical_other) - - def test_shape_not_match_error(): - # shape of value must match shape of logits - # value_shape[:-1] == logits_shape[:-1] - paddle.disable_static() - logits = paddle.rand([3, 5]) - cat = Categorical(logits) - value = paddle.to_tensor([[2, 1, 3], [3, 2, 1]], dtype='int64') - cat.log_prob(value) - - self.assertRaises(ValueError, test_shape_not_match_error) - - -class DistributionTestName(unittest.TestCase): - def get_prefix(self, string): - return (string.split('.')[0]) - - def test_normal_name(self): - name = 'test_normal' - normal1 = Normal(0.0, 1.0, name=name) - self.assertEqual(normal1.name, name) - - normal2 = Normal(0.0, 1.0) - self.assertEqual(normal2.name, 'Normal') - - paddle.enable_static() - - sample = normal1.sample([2]) - self.assertEqual(self.get_prefix(sample.name), name + '_sample') - - entropy = normal1.entropy() - self.assertEqual(self.get_prefix(entropy.name), name + '_entropy') - - value_npdata = np.array([0.8], dtype="float32") - value_tensor = layers.create_tensor(dtype="float32") - layers.assign(value_npdata, value_tensor) - - lp = normal1.log_prob(value_tensor) - self.assertEqual(self.get_prefix(lp.name), name + '_log_prob') - - p = normal1.probs(value_tensor) - self.assertEqual(self.get_prefix(p.name), name + '_probs') - - kl = normal1.kl_divergence(normal2) - self.assertEqual(self.get_prefix(kl.name), name + '_kl_divergence') - - def test_uniform_name(self): - name = 'test_uniform' - uniform1 = Uniform(0.0, 1.0, name=name) - self.assertEqual(uniform1.name, name) - - uniform2 = Uniform(0.0, 1.0) - self.assertEqual(uniform2.name, 'Uniform') - - paddle.enable_static() - - sample = uniform1.sample([2]) - self.assertEqual(self.get_prefix(sample.name), name + '_sample') - - entropy = uniform1.entropy() - self.assertEqual(self.get_prefix(entropy.name), name + '_entropy') - - value_npdata = np.array([0.8], dtype="float32") - value_tensor = layers.create_tensor(dtype="float32") - layers.assign(value_npdata, value_tensor) - - lp = uniform1.log_prob(value_tensor) - self.assertEqual(self.get_prefix(lp.name), name + '_log_prob') - - p = uniform1.probs(value_tensor) - self.assertEqual(self.get_prefix(p.name), name + '_probs') - - def test_categorical_name(self): - name = 'test_categorical' - categorical1 = Categorical([0.4, 0.6], name=name) - self.assertEqual(categorical1.name, name) - - categorical2 = Categorical([0.5, 0.5]) - self.assertEqual(categorical2.name, 'Categorical') - - paddle.enable_static() - - sample = categorical1.sample([2]) - self.assertEqual(self.get_prefix(sample.name), name 
+ '_sample') - - entropy = categorical1.entropy() - self.assertEqual(self.get_prefix(entropy.name), name + '_entropy') - - kl = categorical1.kl_divergence(categorical2) - self.assertEqual(self.get_prefix(kl.name), name + '_kl_divergence') - - value_npdata = np.array([0], dtype="int64") - value_tensor = layers.create_tensor(dtype="int64") - layers.assign(value_npdata, value_tensor) - - p = categorical1.probs(value_tensor) - self.assertEqual(self.get_prefix(p.name), name + '_probs') - - lp = categorical1.log_prob(value_tensor) - self.assertEqual(self.get_prefix(lp.name), name + '_log_prob') - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py b/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py index 3bf2be3d64bee..45cb7e785bc5e 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_code_generate_api.py @@ -13,7 +13,6 @@ # limitations under the License. import paddle.fluid.core as core -import paddle.fluid.eager.eager_tensor_patch_methods as eager_tensor_patch_methods import paddle import numpy as np from paddle.fluid.framework import _test_eager_guard diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 08a68ca246f40..e4576fe2ea8bd 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -13,12 +13,13 @@ # limitations under the License. import paddle.fluid.core as core -import paddle.fluid.eager.eager_tensor_patch_methods as eager_tensor_patch_methods import paddle import numpy as np -from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.framework import _test_eager_guard, EagerParamBase, _in_eager_mode from paddle.fluid.data_feeder import convert_dtype import unittest +import copy +import paddle.compat as cpt class EagerScaleTestCase(unittest.TestCase): @@ -46,14 +47,42 @@ def test_retain_grad_and_run_backward(self): grad_data = np.ones([4, 16, 16, 32]).astype('float32') grad_eager = paddle.to_tensor(grad_data, 'float32', core.CPUPlace()) - core.eager.retain_grad_for_tensor(data_eager) + data_eager.retain_grads() out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) self.assertFalse(data_eager.grad._is_initialized()) - core.eager.run_backward([out_eager], [grad_eager], False) + out_eager.backward(grad_eager, False) self.assertTrue(data_eager.grad._is_initialized()) self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) + def test_retain_grad_and_run_backward_raises(self): + with _test_eager_guard(): + paddle.set_device("cpu") + + input_data = np.ones([4, 16, 16, 32]).astype('float32') + data_eager = paddle.to_tensor(input_data, 'float32', + core.CPUPlace(), False) + + grad_data = np.ones([4, 16, 16, 32]).astype('float32') + grad_data2 = np.ones([4, 16]).astype('float32') + grad_eager = paddle.to_tensor(grad_data, 'float32', core.CPUPlace()) + grad_eager2 = paddle.to_tensor(grad_data2, 'float32', + core.CPUPlace()) + + data_eager.retain_grads() + + out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) + self.assertFalse(data_eager.grad._is_initialized()) + with self.assertRaisesRegexp( + AssertionError, + "The type of grad_tensor must be paddle.Tensor"): + out_eager.backward(grad_data, False) + + with self.assertRaisesRegexp( + AssertionError, + "Tensor shape not match, Tensor of grad_tensor /*"): + 
out_eager.backward(grad_eager2, False) + class EagerDtypeTestCase(unittest.TestCase): def check_to_tesnsor_and_numpy(self, dtype, proto_dtype): @@ -192,6 +221,64 @@ def constructor(self, place): self.assertTrue(egr_tensor9.place._equals(place)) self.assertTrue(np.array_equal(egr_tensor9.numpy(), arr4)) + x = np.random.rand(3, 3).astype('float32') + t = paddle.fluid.Tensor() + t.set(x, paddle.fluid.CPUPlace()) + egr_tensor10 = core.eager.EagerTensor(t, place) + self.assertEqual(egr_tensor10.persistable, False) + self.assertTrue("generated_tensor" in egr_tensor10.name) + self.assertEqual(egr_tensor10.shape, [3, 3]) + self.assertEqual(egr_tensor10.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor10.stop_gradient, True) + self.assertTrue(egr_tensor10.place._equals(place)) + self.assertTrue(np.array_equal(egr_tensor10.numpy(), x)) + + egr_tensor11 = core.eager.EagerTensor(t, place, "framework_constructed") + self.assertEqual(egr_tensor11.persistable, False) + self.assertTrue("framework_constructed" in egr_tensor11.name) + self.assertEqual(egr_tensor11.shape, [3, 3]) + self.assertEqual(egr_tensor11.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor11.stop_gradient, True) + self.assertTrue(egr_tensor11.place._equals(place)) + self.assertTrue(np.array_equal(egr_tensor11.numpy(), x)) + + egr_tensor12 = core.eager.EagerTensor(t) + self.assertEqual(egr_tensor12.persistable, False) + self.assertTrue("generated_tensor" in egr_tensor12.name) + self.assertEqual(egr_tensor12.shape, [3, 3]) + self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor12.stop_gradient, True) + self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) + self.assertTrue(np.array_equal(egr_tensor12.numpy(), x)) + + with self.assertRaisesRegexp( + ValueError, "The shape of Parameter should not be None"): + eager_param = EagerParamBase(shape=None, dtype="float32") + + with self.assertRaisesRegexp( + ValueError, "The dtype of Parameter should not be None"): + eager_param = EagerParamBase(shape=[1, 1], dtype=None) + + with self.assertRaisesRegexp( + ValueError, + "The dimensions of shape for Parameter must be greater than 0"): + eager_param = EagerParamBase(shape=[], dtype="float32") + + with self.assertRaisesRegexp( + ValueError, + "Each dimension of shape for Parameter must be greater than 0, but received /*" + ): + eager_param = EagerParamBase(shape=[-1], dtype="float32") + + eager_param = EagerParamBase(shape=[1, 1], dtype="float32") + self.assertTrue(eager_param.trainable) + eager_param.trainable = False + self.assertFalse(eager_param.trainable) + with self.assertRaisesRegexp( + ValueError, + "The type of trainable MUST be bool, but the type is /*"): + eager_param.trainable = "False" + def test_constructor(self): print("Test_constructor") paddle.set_device("cpu") @@ -202,6 +289,316 @@ def test_constructor(self): for p in place_list: self.constructor(p) + def constructor_with_kwargs(self, place): + # init EagerTensor by Python array + arr = np.random.rand(4, 16, 16, 32).astype('float32') + + egr_tensor0 = core.eager.EagerTensor(value=arr) + self.assertEqual(egr_tensor0.persistable, False) + self.assertTrue("generated" in egr_tensor0.name) + self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32]) + self.assertTrue( + egr_tensor0.place._equals( + paddle.fluid.framework._current_expected_place())) + self.assertEqual(egr_tensor0.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor0.stop_gradient, True) + + egr_tensor1 = 
core.eager.EagerTensor(value=arr, place=place) + self.assertEqual(egr_tensor1.persistable, False) + self.assertTrue("generated" in egr_tensor1.name) + self.assertEqual(egr_tensor1.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor1.place._equals(place)) + self.assertEqual(egr_tensor1.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor1.stop_gradient, True) + + egr_tensor2 = core.eager.EagerTensor(arr, place=place) + self.assertEqual(egr_tensor2.persistable, False) + self.assertTrue("generated" in egr_tensor2.name) + self.assertEqual(egr_tensor2.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor2.place._equals(place)) + self.assertEqual(egr_tensor2.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor2.stop_gradient, True) + + egr_tensor3 = core.eager.EagerTensor( + arr, place=place, name="new_eager_tensor") + self.assertEqual(egr_tensor3.persistable, False) + self.assertTrue("new_eager_tensor" in egr_tensor3.name) + self.assertEqual(egr_tensor3.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor3.place._equals(place)) + self.assertEqual(egr_tensor3.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor3.stop_gradient, True) + + egr_tensor4 = core.eager.EagerTensor( + arr, place=place, persistable=True, name="new_eager_tensor") + self.assertEqual(egr_tensor4.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor4.name) + self.assertEqual(egr_tensor4.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor4.place._equals(place)) + self.assertEqual(egr_tensor4.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor4.stop_gradient, True) + + egr_tensor5 = core.eager.EagerTensor( + arr, + core.CPUPlace(), + persistable=True, + name="new_eager_tensor", + zero_copy=True) + self.assertEqual(egr_tensor5.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor5.name) + self.assertEqual(egr_tensor5.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor5.place.is_cpu_place()) + self.assertEqual(egr_tensor5.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor5.stop_gradient, True) + + egr_tensor6 = core.eager.EagerTensor( + arr, + place=core.CPUPlace(), + persistable=True, + name="new_eager_tensor", + zero_copy=True) + self.assertEqual(egr_tensor6.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor6.name) + self.assertEqual(egr_tensor6.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor6.place.is_cpu_place()) + self.assertEqual(egr_tensor6.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor6.stop_gradient, True) + + egr_tensor7 = core.eager.EagerTensor( + arr, + place=place, + persistable=True, + name="new_eager_tensor", + zero_copy=True) + self.assertEqual(egr_tensor7.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor7.name) + self.assertEqual(egr_tensor7.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor7.place._equals(place)) + self.assertEqual(egr_tensor7.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor7.stop_gradient, True) + + egr_tensor8 = core.eager.EagerTensor( + arr, + place=place, + persistable=True, + name="new_eager_tensor", + zero_copy=True, + stop_gradient=False) + self.assertEqual(egr_tensor8.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor8.name) + self.assertEqual(egr_tensor8.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor8.place._equals(place)) + self.assertEqual(egr_tensor8.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor8.stop_gradient, False) + + egr_tensor9 = core.eager.EagerTensor( + arr, place, 
True, True, "new_eager_tensor", stop_gradient=False) + self.assertEqual(egr_tensor9.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor9.name) + self.assertEqual(egr_tensor9.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor9.place._equals(place)) + self.assertEqual(egr_tensor9.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor9.stop_gradient, False) + + egr_tensor10 = core.eager.EagerTensor( + arr, + place, + True, + True, + name="new_eager_tensor", + stop_gradient=False) + self.assertEqual(egr_tensor10.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor10.name) + self.assertEqual(egr_tensor10.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor10.place._equals(place)) + self.assertEqual(egr_tensor10.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor10.stop_gradient, False) + + egr_tensor11 = core.eager.EagerTensor( + arr, + place, + True, + zero_copy=True, + name="new_eager_tensor", + stop_gradient=False) + self.assertEqual(egr_tensor11.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor11.name) + self.assertEqual(egr_tensor11.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor11.place._equals(place)) + self.assertEqual(egr_tensor11.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor11.stop_gradient, False) + + egr_tensor12 = core.eager.EagerTensor( + arr, + place, + persistable=True, + zero_copy=True, + name="new_eager_tensor", + stop_gradient=False) + self.assertEqual(egr_tensor12.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor12.name) + self.assertEqual(egr_tensor12.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor12.place._equals(place)) + self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor12.stop_gradient, False) + + egr_tensor13 = core.eager.EagerTensor( + value=arr, + place=place, + persistable=True, + zero_copy=True, + name="new_eager_tensor", + stop_gradient=False) + self.assertEqual(egr_tensor13.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor13.name) + self.assertEqual(egr_tensor13.shape, [4, 16, 16, 32]) + self.assertTrue(egr_tensor13.place._equals(place)) + self.assertEqual(egr_tensor13.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor13.stop_gradient, False) + + # special case + egr_tensor14 = core.eager.EagerTensor( + dtype=core.VarDesc.VarType.FP32, + dims=[4, 16, 16, 32], + name="special_eager_tensor", + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=True) + self.assertEqual(egr_tensor14.persistable, True) + self.assertEqual(egr_tensor14.name, "special_eager_tensor") + self.assertEqual(egr_tensor14.shape, [4, 16, 16, 32]) + self.assertEqual(egr_tensor14.dtype, core.VarDesc.VarType.FP32) + + # init EagerTensor by EagerTensor + egr_tensor15 = core.eager.EagerTensor(value=egr_tensor4) + self.assertEqual(egr_tensor15.persistable, True) + self.assertTrue("generated" in egr_tensor15.name) + self.assertEqual(egr_tensor15.shape, egr_tensor4.shape) + self.assertEqual(egr_tensor15.dtype, egr_tensor4.dtype) + self.assertEqual(egr_tensor15.stop_gradient, True) + self.assertTrue( + egr_tensor15.place._equals( + paddle.fluid.framework._current_expected_place())) + self.assertTrue( + np.array_equal(egr_tensor15.numpy(), egr_tensor4.numpy())) + + egr_tensor16 = core.eager.EagerTensor( + value=egr_tensor4, name="new_eager_tensor") + self.assertEqual(egr_tensor16.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor16.name) + self.assertEqual(egr_tensor16.shape, 
egr_tensor4.shape) + self.assertEqual(egr_tensor16.dtype, egr_tensor4.dtype) + self.assertEqual(egr_tensor16.stop_gradient, True) + self.assertTrue( + egr_tensor16.place._equals( + paddle.fluid.framework._current_expected_place())) + self.assertTrue( + np.array_equal(egr_tensor16.numpy(), egr_tensor4.numpy())) + + egr_tensor17 = core.eager.EagerTensor( + value=egr_tensor4, + place=place, + name="new_eager_tensor", ) + self.assertEqual(egr_tensor17.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor17.name) + self.assertEqual(egr_tensor17.shape, egr_tensor4.shape) + self.assertEqual(egr_tensor17.dtype, egr_tensor4.dtype) + self.assertEqual(egr_tensor17.stop_gradient, True) + self.assertTrue(egr_tensor17.place._equals(place)) + self.assertTrue( + np.array_equal(egr_tensor17.numpy(), egr_tensor4.numpy())) + + egr_tensor18 = core.eager.EagerTensor( + egr_tensor4, + place=place, + name="new_eager_tensor", ) + self.assertEqual(egr_tensor18.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor18.name) + self.assertEqual(egr_tensor18.shape, egr_tensor4.shape) + self.assertEqual(egr_tensor18.dtype, egr_tensor4.dtype) + self.assertEqual(egr_tensor18.stop_gradient, True) + self.assertTrue(egr_tensor18.place._equals(place)) + self.assertTrue( + np.array_equal(egr_tensor18.numpy(), egr_tensor4.numpy())) + + egr_tensor19 = core.eager.EagerTensor( + egr_tensor4, + place, + name="new_eager_tensor", ) + self.assertEqual(egr_tensor19.persistable, True) + self.assertTrue("new_eager_tensor" in egr_tensor19.name) + self.assertEqual(egr_tensor19.shape, egr_tensor4.shape) + self.assertEqual(egr_tensor19.dtype, egr_tensor4.dtype) + self.assertEqual(egr_tensor19.stop_gradient, True) + self.assertTrue(egr_tensor19.place._equals(place)) + self.assertTrue( + np.array_equal(egr_tensor19.numpy(), egr_tensor4.numpy())) + + # init eager tensor by framework tensor + x = np.random.rand(3, 3).astype('float32') + t = paddle.fluid.Tensor() + t.set(x, paddle.fluid.CPUPlace()) + egr_tensor20 = core.eager.EagerTensor(value=t) + self.assertEqual(egr_tensor20.persistable, False) + self.assertTrue("generated_tensor" in egr_tensor20.name) + self.assertEqual(egr_tensor20.shape, [3, 3]) + self.assertEqual(egr_tensor20.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor20.stop_gradient, True) + self.assertTrue( + egr_tensor20.place._equals( + paddle.fluid.framework._current_expected_place())) + self.assertTrue(np.array_equal(egr_tensor20.numpy(), x)) + + egr_tensor21 = core.eager.EagerTensor(value=t, place=place) + self.assertEqual(egr_tensor21.persistable, False) + self.assertTrue("generated_tensor" in egr_tensor21.name) + self.assertEqual(egr_tensor21.shape, [3, 3]) + self.assertEqual(egr_tensor21.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor21.stop_gradient, True) + self.assertTrue(egr_tensor21.place._equals(place)) + self.assertTrue(np.array_equal(egr_tensor21.numpy(), x)) + + egr_tensor22 = core.eager.EagerTensor(t, place=place) + self.assertEqual(egr_tensor22.persistable, False) + self.assertTrue("generated_tensor" in egr_tensor22.name) + self.assertEqual(egr_tensor22.shape, [3, 3]) + self.assertEqual(egr_tensor22.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor22.stop_gradient, True) + self.assertTrue(egr_tensor22.place._equals(place)) + self.assertTrue(np.array_equal(egr_tensor22.numpy(), x)) + + egr_tensor23 = core.eager.EagerTensor( + t, place, name="from_framework_tensor") + self.assertEqual(egr_tensor23.persistable, False) + 
self.assertTrue("from_framework_tensor" in egr_tensor23.name) + self.assertEqual(egr_tensor23.shape, [3, 3]) + self.assertEqual(egr_tensor23.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor23.stop_gradient, True) + self.assertTrue(egr_tensor23.place._equals(place)) + self.assertTrue(np.array_equal(egr_tensor23.numpy(), x)) + + egr_tensor24 = core.eager.EagerTensor( + value=t, place=place, name="from_framework_tensor") + self.assertEqual(egr_tensor24.persistable, False) + self.assertTrue("from_framework_tensor" in egr_tensor24.name) + self.assertEqual(egr_tensor24.shape, [3, 3]) + self.assertEqual(egr_tensor24.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor24.stop_gradient, True) + self.assertTrue(egr_tensor24.place._equals(place)) + self.assertTrue(np.array_equal(egr_tensor24.numpy(), x)) + + # Bad usage + # SyntaxError: positional argument follows keyword argument + # egr_tensor25 = core.eager.EagerTensor(value=t, place) + + def test_constructor_with_kwargs(self): + print("Test_constructor_with_kwargs") + paddle.set_device("cpu") + place_list = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + place_list.append(core.CUDAPlace(0)) + with _test_eager_guard(): + for p in place_list: + self.constructor_with_kwargs(p) + def test_copy_and_copy_to(self): print("Test_copy_and_copy_to") with _test_eager_guard(): @@ -223,7 +620,7 @@ def test_copy_and_copy_to(self): self.assertTrue(np.array_equal(tensor.numpy(), arr)) print("Test copy_") tensor.copy_(tensor1, True) - self.assertEqual(tensor.persistable, True) + self.assertEqual(tensor.persistable, False) self.assertEqual(tensor.shape, [4, 16]) self.assertEqual(tensor.dtype, core.VarDesc.VarType.FP32) self.assertTrue(np.array_equal(tensor.numpy(), arr1)) @@ -291,5 +688,104 @@ def test_place_guard(self): core._disable_eager_mode() +class EagerParamBaseUsageTestCase(unittest.TestCase): + def test_print(self): + with _test_eager_guard(): + linear = paddle.nn.Linear(3, 3, bias_attr=False) + print(linear.weight) + + def test_copy(self): + with _test_eager_guard(): + linear = paddle.nn.Linear(1, 3) + linear_copy = copy.deepcopy(linear) + linear_copy2 = linear.weight._copy_to(core.CPUPlace(), True) + self.assertTrue( + np.array_equal(linear.weight.numpy(), + linear_copy.weight.numpy())) + self.assertTrue( + np.array_equal(linear.weight.numpy(), linear_copy2.numpy())) + + def func_fp16_initilaizer(self): + paddle.set_default_dtype("float16") + linear1 = paddle.nn.Linear(1, 3, bias_attr=False) + linear2 = paddle.nn.Linear( + 1, + 3, + bias_attr=False, + weight_attr=paddle.fluid.initializer.Uniform()) + linear3 = paddle.nn.Linear( + 1, + 3, + bias_attr=False, + weight_attr=paddle.fluid.initializer.TruncatedNormalInitializer()) + linear4 = paddle.nn.Linear( + 1, + 3, + bias_attr=False, + weight_attr=paddle.fluid.initializer.MSRAInitializer()) + res = [ + linear1.weight.numpy(), linear2.weight.numpy(), + linear3.weight.numpy(), linear4.weight.numpy() + ] + paddle.set_default_dtype("float32") + return res + + def test_fp16_initializer(self): + res1 = list() + res2 = list() + paddle.seed(102) + paddle.framework.random._manual_program_seed(102) + with _test_eager_guard(): + res1 = self.func_fp16_initilaizer() + res2 = self.func_fp16_initilaizer() + + for i in range(len(res1)): + self.assertTrue(np.array_equal(res1[i], res2[i])) + + def func_layer_helper_base(self, value): + base = paddle.fluid.layer_helper_base.LayerHelperBase("test_layer", + "test_layer") + return base.to_variable(value).numpy() + + def 
func_base_to_variable(self, value): + paddle.fluid.dygraph.base.to_variable(value) + + def test_to_variable(self): + value = np.random.rand(4, 16, 16, 32).astype('float32') + res1 = None + res3 = None + with _test_eager_guard(): + res1 = self.func_layer_helper_base(value) + res3 = self.func_base_to_variable(value) + res2 = self.func_layer_helper_base(value) + res4 = self.func_base_to_variable(value) + self.assertTrue(np.array_equal(res1, res2)) + self.assertTrue(np.array_equal(res3, res4)) + + def test_backward_with_single_tensor(self): + with _test_eager_guard(): + arr4 = np.random.rand(4, 16, 16, 32).astype('float32') + egr_tensor12 = core.eager.EagerTensor(arr4, core.CPUPlace()) + egr_tensor12.retain_grads() + arr = np.ones([4, 16, 16, 32]).astype('float32') + self.assertEqual(egr_tensor12.persistable, False) + self.assertTrue("generated_tensor" in egr_tensor12.name) + self.assertEqual(egr_tensor12.shape, [4, 16, 16, 32]) + self.assertEqual(egr_tensor12.dtype, core.VarDesc.VarType.FP32) + self.assertEqual(egr_tensor12.stop_gradient, True) + self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace())) + self.assertTrue(np.array_equal(egr_tensor12.numpy(), arr4)) + self.assertTrue(np.array_equal(egr_tensor12.gradient(), None)) + egr_tensor12.backward() + self.assertTrue(np.array_equal(egr_tensor12.gradient(), arr)) + + +class EagerGuardTestCase(unittest.TestCase): + def test__test_eager_guard(self): + tracer = paddle.fluid.dygraph.tracer.Tracer() + with _test_eager_guard(tracer): + self.assertTrue(_in_eager_mode()) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_erfinv_op.py b/python/paddle/fluid/tests/unittests/test_erfinv_op.py new file mode 100644 index 0000000000000..847a868dd6ca0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_erfinv_op.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
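Editorial sketch for the TestErfinv case that follows: its user_defined_grads come from the closed-form derivative of the inverse error function. Since erf(erfinv(x)) = x and erf'(y) = 2/sqrt(pi) * exp(-y**2), it follows that d/dx erfinv(x) = sqrt(pi)/2 * exp(erfinv(x)**2), which is exactly the expression the test builds. A small standalone check (plain numpy/scipy, the same modules the test itself imports; illustrative only, not part of the patch code):

import numpy as np
from scipy.special import erfinv

# Closed form used by TestErfinv's user_defined_grads:
#   d/dx erfinv(x) = sqrt(pi) / 2 * exp(erfinv(x) ** 2)
x = np.linspace(-0.9, 0.9, 7)
closed_form = np.sqrt(np.pi) / 2 * np.exp(np.square(erfinv(x)))

# Independent check via a central finite difference.
eps = 1e-6
numeric = (erfinv(x + eps) - erfinv(x - eps)) / (2 * eps)

assert np.allclose(closed_form, numeric, rtol=1e-5)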
+ +from __future__ import print_function + +import unittest +import numpy as np +from scipy.special import erfinv +from op_test import OpTest +import paddle +import paddle.fluid.core as core + +paddle.enable_static() +np.random.seed(0) + + +class TestErfinv(OpTest): + def setUp(self): + self.op_type = "erfinv" + self.init_dtype() + self.shape = [11, 17] + self.x = np.random.uniform(-1, 1, size=self.shape).astype(self.dtype) + self.res_ref = erfinv(self.x).astype(self.dtype) + self.grad_out = np.ones(self.shape, self.dtype) + self.gradient = np.sqrt(np.pi) / 2 * np.exp(np.square( + self.res_ref)) * self.grad_out + self.inputs = {'X': self.x} + self.outputs = {'Out': self.res_ref} + + def init_dtype(self): + self.dtype = np.float64 + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ['X'], + 'Out', + user_defined_grads=[self.gradient], + user_defined_grad_outputs=self.grad_out) + + +class TestErfinvFP32(TestErfinv): + def init_dtype(self): + self.dtype = np.float32 + + +class TestErfinvAPI(unittest.TestCase): + def init_dtype(self): + self.dtype = 'float32' + + def setUp(self): + self.init_dtype() + self.x = np.random.rand(5).astype(self.dtype) + self.res_ref = erfinv(self.x) + self.place = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def test_static_api(self): + paddle.enable_static() + + def run(place): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('x', [1, 5], dtype=self.dtype) + out = paddle.erfinv(x) + exe = paddle.static.Executor(place) + res = exe.run(feed={'x': self.x.reshape([1, 5])}) + for r in res: + self.assertEqual(np.allclose(self.res_ref, r), True) + + for place in self.place: + run(place) + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + x = paddle.to_tensor(self.x) + out = paddle.erfinv(x) + self.assertEqual(np.allclose(self.res_ref, out.numpy()), True) + paddle.enable_static() + + for place in self.place: + run(place) + + def test_inplace_api(self): + def run(place): + paddle.disable_static(place) + x = paddle.to_tensor(self.x) + x.erfinv_() + self.assertEqual(np.allclose(self.res_ref, x.numpy()), True) + paddle.enable_static() + + for place in self.place: + run(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py new file mode 100644 index 0000000000000..7d43ebadf41bb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -0,0 +1,211 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
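A note on the verify_output check in TestExponentialOp1 below: exponential_ fills the tensor with random draws, so the test can only compare distributions, not exact values; it histograms the output over (0, 5) and matches it against np.random.exponential(1/lambda) samples. The same idea can be checked against the closed-form Exponential bin mass, P(a < X <= b) = exp(-lambda*a) - exp(-lambda*b). A standalone sketch of that check (the 0.03 tolerance is a deliberately loose choice here, just to keep the illustration stable; it is not the tolerance the OpTest uses):

import numpy as np

lam = 0.5
samples = np.random.exponential(1.0 / lam, [1024, 1024])

# Normalized histogram over (0, 5), 10 bins -- mirroring verify_output.
hist, edges = np.histogram(samples, range=(0, 5))
hist = hist.astype("float64") / samples.size

# Exact probability mass of each bin for an Exponential(rate=lam) variable.
expected = np.exp(-lam * edges[:-1]) - np.exp(-lam * edges[1:])

# With ~1e6 samples, even the emptiest bin carries well under 1% relative noise.
assert np.allclose(hist, expected, rtol=0.03)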
+ +import unittest +import paddle +import numpy as np +from op_test import OpTest + +paddle.enable_static() +paddle.seed(100) + + +class TestExponentialOp1(OpTest): + def setUp(self): + self.op_type = "exponential" + self.config() + + self.attrs = {"lambda": self.lam} + self.inputs = {'X': np.empty([1024, 1024], dtype=self.dtype)} + self.outputs = {'Out': np.ones([1024, 1024], dtype=self.dtype)} + + def config(self): + self.lam = 0.5 + self.dtype = "float64" + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + hist1, _ = np.histogram(outs[0], range=(0, 5)) + hist1 = hist1.astype("float32") + hist1 = hist1 / float(outs[0].size) + + data_np = np.random.exponential(1. / self.lam, [1024, 1024]) + hist2, _ = np.histogram(data_np, range=(0, 5)) + hist2 = hist2.astype("float32") + hist2 = hist2 / float(data_np.size) + + self.assertTrue( + np.allclose( + hist1, hist2, rtol=0.02), + "actual: {}, expected: {}".format(hist1, hist2)) + + def test_check_grad_normal(self): + self.check_grad( + ['X'], + 'Out', + user_defined_grads=[np.zeros( + [1024, 1024], dtype=self.dtype)], + user_defined_grad_outputs=[ + np.random.rand(1024, 1024).astype(self.dtype) + ]) + + +class TestExponentialOp2(TestExponentialOp1): + def config(self): + self.lam = 0.25 + self.dtype = "float32" + + +class TestExponentialAPI(unittest.TestCase): + def test_static(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x_np = np.full([10, 10], -1.) + x = paddle.static.data(name="X", shape=[10, 10], dtype='float64') + x.exponential_(1.0) + + exe = paddle.static.Executor() + out = exe.run(paddle.static.default_main_program(), + feed={"X": x_np}, + fetch_list=[x]) + self.assertTrue(np.min(out) >= 0) + + def test_dygraph(self): + paddle.disable_static() + x = paddle.full([10, 10], -1., dtype='float32') + x.exponential_(0.5) + self.assertTrue(np.min(x.numpy()) >= 0) + paddle.enable_static() + + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + def test_fixed_random_number(self): + if not paddle.is_compiled_with_cuda(): + return + + # Note(zhouwei): The Number of threads is determined by + # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different + # GPU have different number of threads, which result in different + # random value. Only test on V100 GPU here. 
+ if not "V100" in paddle.device.cuda.get_device_name(): + return + + print("Test Fixed Random number on V100 GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + + x = paddle.empty([64, 3, 1024, 1024], dtype="float32") + x.exponential_(1.0) + x_np = x.numpy() + expect = [ + 0.80073667, 0.2249291, 0.07734892, 1.25392, 0.14013891, 0.45736602, + 1.9735607, 0.30490234, 0.57100505, 0.8115938 + ] + + self.assertTrue(np.allclose(x_np[0, 0, 0, 0:10], expect)) + expect = [ + 1.4296371e+00, 9.5411777e-01, 5.2575850e-01, 2.4805880e-01, + 1.2322118e-04, 8.4604341e-01, 2.1111444e-01, 1.4143821e+00, + 2.8194717e-01, 1.1360573e+00 + ] + self.assertTrue(np.allclose(x_np[16, 1, 300, 200:210], expect)) + expect = [ + 1.3448033, 0.35146526, 1.7380928, 0.32012638, 0.10396296, + 0.51344526, 0.15308502, 0.18712929, 0.03888268, 0.20771872 + ] + self.assertTrue(np.allclose(x_np[32, 1, 600, 500:510], expect)) + expect = [ + 0.5107464, 0.20970327, 2.1986802, 1.580056, 0.31036147, 0.43966478, + 0.9056133, 0.30119267, 1.4797124, 1.4319834 + ] + self.assertTrue(np.allclose(x_np[48, 2, 900, 800:810], expect)) + expect = [ + 3.4640615, 1.1019983, 0.41195083, 0.22681557, 0.291846, 0.53617656, + 1.5791925, 2.4645927, 0.04094889, 0.9057725 + ] + self.assertTrue(np.allclose(x_np[63, 2, 1023, 1000:1010], expect)) + + x = paddle.empty([10, 10], dtype="float32") + x.exponential_(3.0) + x_np = x.numpy() + expect = [ + 0.02831675, 0.1691551, 0.6798956, 0.69347525, 0.0243443, 0.22180498, + 0.30574575, 0.9839696, 0.2834912, 0.59420055 + ] + self.assertTrue(np.allclose(x_np[5, 0:10], expect)) + + x = paddle.empty([16, 2, 1024, 768], dtype="float64") + x.exponential_(0.25) + x_np = x.numpy() + expect = [ + 10.0541229, 12.67860643, 1.09850734, 7.35289643, 2.65471225, + 3.86217432, 2.97902086, 2.92744479, 2.67927152, 0.19667352 + ] + self.assertTrue(np.allclose(x_np[0, 0, 0, 100:110], expect)) + expect = [ + 0.68328125, 3.1454553, 0.92158376, 1.95842188, 1.05296941, + 12.93242051, 5.20255978, 3.3588624, 1.57377174, 5.73194183 + ] + self.assertTrue(np.allclose(x_np[4, 0, 300, 190:200], expect)) + expect = [ + 1.37973974, 3.45036798, 7.94625406, 1.62610973, 0.31032122, + 4.13596493, 1.98494535, 1.13207041, 8.30592769, 2.81460147 + ] + self.assertTrue(np.allclose(x_np[8, 1, 600, 300:310], expect)) + expect = [ + 2.27710811, 12.25003028, 2.96409124, 4.72405788, 0.67917249, + 4.35856718, 0.46870976, 2.31120149, 9.61595826, 4.64446271 + ] + self.assertTrue(np.allclose(x_np[12, 1, 900, 500:510], expect)) + expect = [ + 0.95883744, 1.57316361, 15.22524512, 20.49559882, 13.70008548, + 3.29430143, 3.90390424, 0.9146657, 0.80972249, 0.33376219 + ] + self.assertTrue(np.allclose(x_np[15, 1, 1023, 750:760], expect)) + + x = paddle.empty([512, 768], dtype="float64") + x.exponential_(0.3) + x_np = x.numpy() + expect = [ + 8.79266704, 4.79596009, 2.75480243, 6.04670011, 0.35379556, + 0.76864868, 3.17428251, 0.26556859, 12.22485885, 10.51690383 + ] + self.assertTrue(np.allclose(x_np[0, 200:210], expect)) + expect = [ + 5.6341126, 0.52243418, 5.36410796, 6.83672002, 11.9243311, + 5.85985566, 5.75169548, 0.13877972, 6.1348385, 3.82436519 + ] + self.assertTrue(np.allclose(x_np[300, 400:410], expect)) + expect = [ + 4.94883581, 0.56345306, 0.85841585, 1.92287801, 6.10036656, + 1.19524847, 3.64735434, 5.19618716, 2.57467974, 3.49152791 + ] + self.assertTrue(np.allclose(x_np[500, 700:710], expect)) + + x = paddle.empty([10, 10], dtype="float64") + x.exponential_(4.0) + x_np = x.numpy() + expect = [ + 0.15713826, 
0.56395964, 0.0680941, 0.00316643, 0.27046853, + 0.19852724, 0.12776634, 0.09642974, 0.51977551, 1.33739699 + ] + self.assertTrue(np.allclose(x_np[5, 0:10], expect)) + + paddle.enable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py index 496f3505ec41b..f6cb23c4beaa9 100755 --- a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py +++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py @@ -24,6 +24,7 @@ from paddle.dataset.common import DATA_HOME from paddle.fluid.framework import core, in_dygraph_mode from paddle.fluid.layer_helper import LayerHelper +from paddle import _C_ops import sys sys.path.append("./tokenizer") @@ -75,7 +76,7 @@ def forward(self, is_split_into_words=False, pad_to_max_seq_len=False): if in_dygraph_mode(): - input_ids, seg_ids = core.ops.faster_tokenizer( + input_ids, seg_ids = _C_ops.faster_tokenizer( self.vocab, text, text_pair, "do_lower_case", do_lower_case, "max_seq_len", max_seq_len, "pad_to_max_seq_len", pad_to_max_seq_len, "is_split_into_words", is_split_into_words) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py index 3078e5b3d100e..9675a77d6766b 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py @@ -24,9 +24,9 @@ class TestFleetBase(unittest.TestCase): def setUp(self): os.environ["POD_IP"] = "127.0.0.1" os.environ["PADDLE_PORT"] = "36000" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + #os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + # "127.0.0.1:36001,127.0.0.2:36001" def test_ps_minimize(self): import paddle @@ -78,45 +78,6 @@ def test_ps_minimize(self): fleet.load_model(path="/tmp", mode=0) fleet.load_model(path="/tmp", mode=1) - self.assertRaises( - Exception, - fleet.save_inference_model, - dirname='/tmp/', - feeded_var_names=['x', 'y'], - target_vars=[avg_cost], - executor="exe") - - self.assertRaises( - Exception, - fleet.save_inference_model, - dirname='/tmp/', - feeded_var_names=['x', 'y'], - target_vars=[avg_cost], - executor=exe, - main_program=compiled_prog) - - self.assertRaises( - Exception, - fleet.save_inference_model, - dirname='afs:/tmp/', - feeded_var_names=['x', 'y'], - target_vars=[avg_cost], - executor=exe, - main_program=compiled_prog) - - self.assertRaises( - Exception, fleet.save_persistables, executor=pe, dirname='/tmp/') - - self.assertRaises( - Exception, fleet.save_persistables, executor="exe", dirname='/tmp/') - - self.assertRaises( - Exception, - fleet.save_persistables, - executor=exe, - dirname='/tmp/', - main_program=compiled_prog) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index c7eaf4e0ff33d..42ec81ad9d869 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -190,6 +190,53 @@ def test_sharding_amp_recompute_optimizer(self): 'momentum', 'momentum' ]) + def test_sharding_amp_asp_optimizer(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, 
strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'sharding') + self.set_strategy(strategy, 'amp') + self.set_strategy(strategy, 'asp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + vars = [x.name for x in train_prog.list_vars()] + parameters = [ + x.name for x in train_prog.list_vars() if x.persistable == True + ] + + self.assertIn('@BroadCast', ''.join(vars)) + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + self.assertEqual( + set(parameters), + set([ + 'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0', + 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0_asp_mask', + 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0_asp_mask', + 'fc_0.w_0_asp_mask', 'fc_1.b_0_velocity_0', + 'fc_2.b_0_velocity_0' + ])) + self.assertEqual(ops, [ + 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', + 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', + 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', + 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', + 'mean', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', + 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', + 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'cast', + 'cast', 'cast', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'momentum', 'momentum', 'elementwise_mul' + ]) + def test_sharding_weight_decay(self): train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( ) diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py new file mode 100644 index 0000000000000..14a59b413383f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fold_op.py @@ -0,0 +1,204 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
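Before the fold test below, a quick note on its geometry: paddle.nn.Fold assembles an [N, C*kh*kw, L] tensor of sliding local blocks back into an [N, C, H, W] image (the inverse of unfold), so L must equal the number of block positions, col_height * col_width, computed the same way calc_fold does. A tiny sketch using the configuration from TestFoldOp.init_data (the num_blocks helper is only for illustration):

# Number of sliding-block positions along one spatial dimension, matching the
# col_height / col_width formula in calc_fold below.
def num_blocks(out_size, kernel, stride, pad_begin, pad_end, dilation):
    dkernel = dilation * (kernel - 1) + 1
    return (out_size + pad_begin + pad_end - dkernel) // stride + 1

# TestFoldOp.init_data: output_sizes=[4, 5], 2x2 kernel, stride 1, no padding,
# dilation 1, input_channels = 3 * 2 * 2 (C=3 packed with the 2x2 kernel).
col_h = num_blocks(4, 2, 1, 0, 0, 1)   # -> 3
col_w = num_blocks(5, 2, 1, 0, 0, 1)   # -> 4
assert col_h * col_w == 12             # matches self.length = 12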
+ +from __future__ import print_function + +import math +import numpy as np +import unittest +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() + + +class TestFoldOp(OpTest): + """ + This is for test on fold Op + """ + + def init_data(self): + self.batch_size = 3 + self.input_channels = 3 * 2 * 2 + self.length = 12 + self.kernel_sizes = [2, 2] + self.strides = [1, 1] + self.paddings = [0, 0, 0, 0] + self.dilations = [1, 1] + self.output_sizes = [4, 5] + input_shape = [self.batch_size, self.input_channels, self.length] + self.x = np.random.rand(*input_shape).astype(np.float64) + + def calc_fold(self): + output_shape = [0] * 4 + output_shape[0] = self.batch_size + output_shape[1] = int(self.input_channels / + (self.kernel_sizes[0] * self.kernel_sizes[1])) + output_shape[2] = self.output_sizes[0] + output_shape[3] = self.output_sizes[1] + dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1 + dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1 + col_height = int((self.output_sizes[0] + self.paddings[0] + + self.paddings[2] - dkernel_h) / self.strides[0]) + 1 + col_width = int((self.output_sizes[1] + self.paddings[1] + + self.paddings[3] - dkernel_w) / self.strides[1]) + 1 + output = np.zeros(output_shape).astype(np.float64) + ############ calculate output ############## + for b in range(output_shape[0]): + for c in range(self.input_channels): + w_offset = int(c % self.kernel_sizes[1]) + h_offset = int( + (c / self.kernel_sizes[1]) % self.kernel_sizes[0]) + c_out = int(c / self.kernel_sizes[0] / self.kernel_sizes[1]) + for h in range(col_height): + h_out = int(h * self.strides[0] - self.paddings[0] + + h_offset * self.dilations[0]) + for w in range(col_width): + w_out = int(w * self.strides[1] - self.paddings[1] + + w_offset * self.dilations[1]) + if (h_out >= 0 and h_out < self.output_sizes[0]) and ( + w_out >= 0 and w_out < self.output_sizes[1]): + output[b, c_out, h_out, w_out] += self.x[ + b, c, w + col_width * h] + + self.outputs = output + + def set_data(self): + self.init_data() + self.calc_fold() + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)} + self.attrs = { + 'kernel_sizes': self.kernel_sizes, + 'paddings': self.paddings, + 'dilations': self.dilations, + 'strides': self.strides, + 'output_sizes': self.output_sizes + } + self.outputs = {'Y': self.outputs} + + def setUp(self): + self.op_type = 'fold' + self.set_data() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y') + + +class TestFoldAPI(TestFoldOp): + + #This is for test on paddle.nn.Fold + + def setUp(self): + self.op_type = 'fold' + self.set_data() + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def test_api(self): + for place in self.places: + with fluid.dygraph.guard(place): + input = paddle.to_tensor(self.x) + m = paddle.nn.Fold(**self.attrs) + m.eval() + result = m(input) + self.assertTrue(np.allclose(result.numpy(), self.outputs['Y'])) + + def test_info(self): + str(paddle.nn.Fold(**self.attrs)) + + +class TestFoldOpError(unittest.TestCase): + def test_errors(self): + from paddle.nn.functional import fold + from paddle.fluid.framework import Program, program_guard + with program_guard(Program(), Program()): + + def test_input_shape(): + # input_shpae must be 3-D + x = paddle.randn(shape=[2, 3, 6, 7], dtype="float32") + out = fold(x, output_sizes=[2, 3], kernel_sizes=[2, 2]) + + def 
test_kernel_shape(): + # kernel_size must be 2 + x = paddle.randn(shape=[2, 6, 6], dtype="float32") + out = fold(x, output_sizes=[2, 3], kernel_sizes=[2, 2, 3]) + + def test_padding_shape(): + # padding_size must be 2 or 4 + x = paddle.randn(shape=[2, 6, 6], dtype="float32") + out = fold( + x, + output_sizes=[2, 3], + kernel_sizes=[2, 2], + paddings=[2, 2, 3]) + + def test_dilations_shape(): + # dialtions_size must be 2 + x = paddle.randn(shape=[2, 6, 6], dtype="float32") + out = fold( + x, + output_sizes=[2, 3], + kernel_sizes=[2, 2], + dilations=[2, 2, 3]) + + def test_strides_shape(): + # strids_size must be 2 + x = paddle.randn(shape=[2, 6, 6], dtype="float32") + out = fold( + x, + output_sizes=[2, 3], + kernel_sizes=[2, 2], + strides=[2, 2, 3]) + + def test_output_size(): + # im_h * im_w must be L + x = paddle.randn(shape=[2, 6, 6], dtype="float32") + out = fold( + x, output_sizes=[6, 6], kernel_sizes=[2, 2], + strides=[1, 1]) + + def test_block_h_w(): + # test_block_h_w GT 0 + x = paddle.randn(shape=[2, 1, 1], dtype="float32") + out = fold( + x, output_sizes=[1, 1], kernel_sizes=[2, 2], strides=1) + + def test_GT_0(): + x = paddle.randn(shape=[2, 1, 1], dtype="float32") + out = fold( + x, + output_sizes=[0, 0], + kernel_sizes=[0, 0], + dilations=0, + paddings=[0, 0], + strides=0) + + self.assertRaises(AssertionError, test_input_shape) + self.assertRaises(AssertionError, test_kernel_shape) + self.assertRaises(ValueError, test_padding_shape) + self.assertRaises(AssertionError, test_dilations_shape) + self.assertRaises(AssertionError, test_strides_shape) + self.assertRaises(ValueError, test_output_size) + self.assertRaises(ValueError, test_block_h_w) + self.assertRaises(ValueError, test_GT_0) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 2938eabd07b9c..a8ed23f5938c0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -598,6 +598,32 @@ def test_set_master_weight(self): self.assertEqual(optimizers[0]._multi_precision, False) self.assertEqual(optimizers[1]._multi_precision, False) + def test_skip_BatchNorm_Layer_norm(self): + model = paddle.nn.LayerNorm(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + + model = paddle.nn.BatchNorm(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + + model = paddle.nn.BatchNorm1D(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + + model = paddle.nn.BatchNorm2D(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + + model = paddle.nn.BatchNorm3D(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + class TestPureFp16SaveLoad(unittest.TestCase): def test_save_dtype_exception(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index d6835069b9d2a..07a8ae0ba0f9f 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -24,6 +24,7 @@ import paddle.fluid.dygraph_utils as dygraph_utils from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper import paddle +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode, in_dygraph_mode class MyLayer(fluid.Layer): @@ -93,58 +94,13 @@ def __init__(self, step_input_size, hidden_size, output_size, param_attr): is_bias=False) def forward(self, input, pre_hidden): - tmp_i2h = self.create_variable(dtype=self._dtype) - tmp_h2h = self.create_variable(dtype=self._dtype) - hidden = self.create_variable(dtype=self._dtype) - out = self.create_variable(dtype=self._dtype) - softmax_out = self.create_variable(dtype=self._dtype) - reduce_out = self.create_variable(dtype=self._dtype) - self._helper.append_op( - type="mul", - inputs={"X": input, - "Y": self._i2h_w}, - outputs={"Out": tmp_i2h}, - attrs={"x_num_col_dims": 1, - "y_num_col_dims": 1}) - - self._helper.append_op( - type="mul", - inputs={"X": pre_hidden, - "Y": self._h2h_w}, - outputs={"Out": tmp_h2h}, - attrs={"x_num_col_dims": 1, - "y_num_col_dims": 1}) - - self._helper.append_op( - type="elementwise_add", - inputs={'X': tmp_h2h, - 'Y': tmp_i2h}, - outputs={'Out': hidden}, - attrs={'axis': -1, - 'use_mkldnn': False}) + tmp_i2h = paddle.fluid.layers.nn.mul(input, self._i2h_w) + tmp_h2h = paddle.fluid.layers.nn.mul(pre_hidden, self._h2h_w) + hidden = paddle.add(tmp_h2h, tmp_i2h) hidden = self._helper.append_activation(hidden, act='tanh') - - self._helper.append_op( - type="mul", - inputs={"X": hidden, - "Y": self._h2o_w}, - outputs={"Out": out}, - attrs={"x_num_col_dims": 1, - "y_num_col_dims": 1}) - - self._helper.append_op( - type="softmax", - inputs={"X": out}, - outputs={"Out": softmax_out}, - attrs={"use_cudnn": False}) - - self._helper.append_op( - type='reduce_sum', - inputs={'X': softmax_out}, - outputs={'Out': reduce_out}, - attrs={'keep_dim': False, - 'reduce_all': True}) - + out = paddle.fluid.layers.nn.mul(hidden, self._h2o_w) + softmax_out = paddle.nn.functional.softmax(out) + reduce_out = paddle.fluid.layers.nn.reduce_sum(softmax_out) return reduce_out, hidden @@ -180,12 +136,12 @@ def forward(self, inputs): class TestImperative(unittest.TestCase): - def test_functional_dygraph_context(self): + def functional_dygraph_context(self): self.assertFalse(fluid.dygraph.enabled()) fluid.enable_dygraph() self.assertTrue(fluid.dygraph.enabled()) np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - var_inp = fluid.dygraph.base.to_variable(np_inp) + var_inp = paddle.to_tensor(np_inp) mlp = MLP(input_size=2) out = mlp(var_inp) dy_out1 = out.numpy() @@ -195,7 +151,7 @@ def test_functional_dygraph_context(self): self.assertFalse(fluid.dygraph.enabled()) with fluid.dygraph.guard(): self.assertTrue(fluid.dygraph.enabled()) - var_inp = fluid.dygraph.base.to_variable(np_inp) + var_inp = paddle.to_tensor(np_inp) mlp = MLP(input_size=2) out = mlp(var_inp) dy_out2 = out.numpy() @@ -205,7 +161,12 @@ def test_functional_dygraph_context(self): self.assertTrue(np.array_equal(dy_out1, dy_out2)) self.assertTrue(np.array_equal(dy_grad1, dy_grad2)) - def test_functional_paddle_imperative_dygraph_context(self): + def test_functional_dygraph_context(self): + with _test_eager_guard(): + self.functional_dygraph_context() + self.functional_dygraph_context() + + def functional_paddle_imperative_dygraph_context(self): self.assertFalse(paddle.in_dynamic_mode()) 
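# Note on the pattern used throughout this file: each func_* body is exercised twice by its test_* wrapper, once inside _test_eager_guard() (eager mode) and once in the legacy VarBase mode, so both execution paths stay covered.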
paddle.disable_static() self.assertTrue(paddle.in_dynamic_mode()) @@ -231,23 +192,53 @@ def test_functional_paddle_imperative_dygraph_context(self): self.assertTrue(np.array_equal(dy_out1, dy_out2)) self.assertTrue(np.array_equal(dy_grad1, dy_grad2)) - def test_isinstance(self): + def test_functional_paddle_imperative_dygraph_context(self): + with _test_eager_guard(): + self.functional_paddle_imperative_dygraph_context() + self.functional_paddle_imperative_dygraph_context() + + def func_isinstance(self): var = fluid.layers.data(shape=[1], name='x', dtype='float32') self.assertTrue(isinstance(var, fluid.Variable)) with fluid.dygraph.guard(): - var_base = fluid.dygraph.base.to_variable(np.array([3, 4, 5])) - self.assertTrue(isinstance(var_base, core.VarBase)) - self.assertTrue(isinstance(var_base, fluid.Variable)) + if fluid.framework._in_eager_mode(): + var_base = paddle.to_tensor(np.array([3, 4, 5])) + self.assertTrue(isinstance(var_base, core.eager.EagerTensor)) + else: + var_base = paddle.to_tensor(np.array([3, 4, 5])) + self.assertTrue(isinstance(var_base, core.VarBase)) + self.assertTrue(isinstance(var_base, fluid.Variable)) + + def test_isinstance(self): + with _test_eager_guard(): + self.func_isinstance() + self.func_isinstance() - def test_create_VarBase(self): + def func_create_varbase(self): x = np.ones([2, 2], np.float32) y = np.zeros([3, 3], np.float32) t = fluid.Tensor() t.set(x, fluid.CPUPlace()) - with fluid.dygraph.guard(): + if _in_eager_mode(): + # TODO(jiabin): Support Kwargs and uncomment these tests + # egr_tmp = fluid.core.eager.EagerTensor(value=x, place=fluid.core.CPUPlace()) + egr_tmp2 = fluid.core.eager.EagerTensor(y, fluid.core.CPUPlace()) + egr_tmp3 = paddle.to_tensor(x) + egr_tmp4 = fluid.core.eager.EagerTensor(y) + # egr_tmp5 = fluid.core.eager.EagerTensor(value=x) + # TODO(jiabin): Support it when we merge LoDTensor with DenseTensor + egr_tmp6 = fluid.core.eager.EagerTensor(t) + + # self.assertTrue(np.array_equal(x, egr_tmp.numpy())) + self.assertTrue(np.array_equal(y, egr_tmp2.numpy())) + self.assertTrue(np.array_equal(x, egr_tmp3.numpy())) + self.assertTrue(np.array_equal(y, egr_tmp4.numpy())) + # self.assertTrue(np.array_equal(x, egr_tmp5.numpy())) + self.assertTrue(np.array_equal(x, egr_tmp6.numpy())) + else: tmp = fluid.core.VarBase(value=x, place=fluid.core.CPUPlace()) tmp2 = fluid.core.VarBase(y, fluid.core.CPUPlace()) - tmp3 = fluid.dygraph.base.to_variable(x) + tmp3 = paddle.to_tensor(x) tmp4 = fluid.core.VarBase(y) tmp5 = fluid.core.VarBase(value=x) tmp6 = fluid.core.VarBase(t) @@ -259,6 +250,12 @@ def test_create_VarBase(self): self.assertTrue(np.array_equal(x, tmp5.numpy())) self.assertTrue(np.array_equal(x, tmp6.numpy())) + def test_create_varbase(self): + with fluid.dygraph.guard(): + with _test_eager_guard(): + self.func_create_varbase() + self.func_create_varbase() + def test_no_grad_guard(self): data = np.array([[2, 3], [4, 5]]).astype('float32') with fluid.dygraph.guard(): @@ -269,8 +266,8 @@ def test_no_grad_guard(self): self.assertTrue(l1.weight.stop_gradient is False) tmp = l1.weight * 2 self.assertTrue(tmp.stop_gradient) - x = fluid.dygraph.to_variable(data) - y = l0(x) + tmp + x = paddle.to_tensor(data) + y = paddle.add(l0(x), tmp) o = l1(y) o.backward() @@ -287,8 +284,8 @@ def test_paddle_imperative_no_grad_guard(self): self.assertTrue(l1.weight.stop_gradient is False) tmp = l1.weight * 2 self.assertTrue(tmp.stop_gradient) - x = fluid.dygraph.to_variable(data) - y = l0(x) + tmp + x = paddle.to_tensor(data) + y = paddle.add(l0(x), tmp) o 
= l1(y) o.backward() @@ -308,8 +305,8 @@ def test_paddle_imperative_set_grad_enabled(self): tmp2 = l1.weight * 2 self.assertTrue(tmp.stop_gradient) self.assertTrue(tmp2.stop_gradient is False) - x = fluid.dygraph.to_variable(data) - y = l0(x) + tmp2 + x = paddle.to_tensor(data) + y = paddle.add(l0(x), tmp2) o = l1(y) o.backward() @@ -317,24 +314,31 @@ def test_paddle_imperative_set_grad_enabled(self): self.assertTrue(tmp2._grad_ivar() is not None) self.assertTrue(l0.weight._grad_ivar() is not None) - def test_sum_op(self): + def test_paddle_imperative_is_grad_enabled(self): + with fluid.dygraph.guard(): + with paddle.set_grad_enabled(False): + self.assertTrue(paddle.is_grad_enabled() is False) + with paddle.set_grad_enabled(True): + self.assertTrue(paddle.is_grad_enabled()) + + def func_sum_op(self): x = np.ones([2, 2], np.float32) with fluid.dygraph.guard(): inputs = [] for _ in range(10): - tmp = fluid.dygraph.base.to_variable(x) + tmp = paddle.to_tensor(x) tmp.stop_gradient = False inputs.append(tmp) - ret = fluid.layers.sums(inputs) + ret = paddle.add_n(inputs) loss = fluid.layers.reduce_sum(ret) loss.backward() with fluid.dygraph.guard(): inputs2 = [] for _ in range(10): - tmp = fluid.dygraph.base.to_variable(x) + tmp = paddle.to_tensor(x) tmp.stop_gradient = False inputs2.append(tmp) - ret2 = fluid.layers.sums(inputs2) + ret2 = paddle.add_n(inputs2) loss2 = fluid.layers.reduce_sum(ret2) fluid.set_flags({'FLAGS_sort_sum_gradient': True}) loss2.backward() @@ -345,12 +349,22 @@ def test_sum_op(self): a = inputs2[0].gradient() self.assertTrue(np.allclose(inputs2[0].gradient(), x)) - def test_empty_var(self): + def test_sum_op(self): + with _test_eager_guard(): + self.func_sum_op() + self.func_sum_op() + + def func_empty_var(self): with fluid.dygraph.guard(): cur_program = fluid.Program() cur_block = cur_program.current_block() - new_variable = cur_block.create_var( - name="X", shape=[-1, 23, 48], dtype='float32') + # Normally, we don't allow tensor with -1 shape being created in dygraph mode, this test is not good. 
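# (Dygraph/eager tensors always carry fully materialized shapes, so the -1 placeholder dim used by the static-graph variant is replaced with a concrete 1 in the eager branch below.)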
+ if not _in_eager_mode(): + new_variable = cur_block.create_var( + name="X", shape=[-1, 23, 48], dtype='float32') + else: + new_variable = cur_block.create_var( + name="X", shape=[1, 23, 48], dtype='float32') try: new_variable.numpy() except Exception as e: @@ -360,53 +374,77 @@ def test_empty_var(self): new_variable.backward() except Exception as e: assert type(e) == core.EnforceNotMet + # TODO(jiabin): Support clear_gradient in eager mode later and remove this if statement + if not _in_eager_mode(): + try: + new_variable.clear_gradient() + except Exception as e: + assert type(e) == core.EnforceNotMet - try: - new_variable.clear_gradient() - except Exception as e: - assert type(e) == core.EnforceNotMet + def test_empty_var(self): + with _test_eager_guard(): + self.func_empty_var() + self.func_empty_var() - def test_empty_grad(self): + def func_empty_grad(self): with fluid.dygraph.guard(): x = np.ones([2, 2], np.float32) - new_var = fluid.dygraph.base.to_variable(x) - try: - new_var.gradient() - except Exception as e: - assert type(e) == ValueError - - try: - new_var.clear_gradient() - except Exception as e: - assert type(e) == core.EnforceNotMet + new_var = paddle.to_tensor(x) + self.assertIsNone(new_var.gradient()) + # TODO(jiabin): Support clear_gradient in eager mode later and remove this if statement + if not _in_eager_mode(): + try: + new_var.clear_gradient() + except Exception as e: + assert type(e) == core.EnforceNotMet with fluid.dygraph.guard(): cur_program = fluid.Program() cur_block = cur_program.current_block() - new_variable = cur_block.create_var( - name="X", shape=[-1, 23, 48], dtype='float32') + # Normally, we don't allow tensor with -1 shape being created in dygraph mode, this test is not good. + if not _in_eager_mode(): + new_variable = cur_block.create_var( + name="X", shape=[-1, 23, 48], dtype='float32') + else: + new_variable = cur_block.create_var( + name="X", shape=[1, 23, 48], dtype='float32') try: new_variable.gradient() except Exception as e: assert type(e) == ValueError - def test_set_persistable(self): + def test_empty_grad(self): + with _test_eager_guard(): + self.func_empty_grad() + self.func_empty_grad() + + def func_set_persistable(self): with fluid.dygraph.guard(): x = np.ones([2, 2], np.float32) - new_var = fluid.dygraph.base.to_variable(x) + new_var = paddle.to_tensor(x) self.assertFalse(new_var.persistable) new_var.persistable = True self.assertTrue(new_var.persistable) - def test_layer(self): + def test_set_persistable(self): + with _test_eager_guard(): + self.func_set_persistable() + self.func_set_persistable() + + def func_layer(self): with fluid.dygraph.guard(): l = fluid.Layer("l") self.assertRaises(NotImplementedError, l.forward, []) - def test_layer_in_out(self): + def test_layer(self): + with _test_eager_guard(): + self.func_layer() + self.func_layer() + + def func_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) with fluid.dygraph.guard(): - var_inp = fluid.dygraph.base.to_variable(np_inp) + var_inp = paddle.to_tensor(np_inp) var_inp.stop_gradient = False l = MyLayer() x = l(var_inp)[0] @@ -416,7 +454,7 @@ def test_layer_in_out(self): dy_grad = l._x_for_debug.gradient() with fluid.dygraph.guard(): - var_inp2 = fluid.dygraph.base.to_variable(np_inp) + var_inp2 = paddle.to_tensor(np_inp) var_inp2.stop_gradient = False l2 = MyLayer() x2 = l2(var_inp2)[0] @@ -440,15 +478,20 @@ def test_layer_in_out(self): feed={inp.name: np_inp}, fetch_list=[x.name, param_grads[1].name]) - self.assertTrue(np.allclose(dy_out, static_out)) - 
self.assertTrue(np.allclose(dy_grad, static_grad)) - self.assertTrue(np.allclose(dy_out2, static_out)) - self.assertTrue(np.allclose(dy_grad2, static_grad)) + self.assertTrue(np.array_equal(dy_out, static_out)) + self.assertTrue(np.array_equal(dy_grad, static_grad)) + self.assertTrue(np.array_equal(dy_out2, static_out)) + self.assertTrue(np.array_equal(dy_grad2, static_grad)) - def test_mlp(self): + def test_layer_in_out(self): + with _test_eager_guard(): + self.func_layer_in_out() + self.func_layer_in_out() + + def func_mlp(self): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) with fluid.dygraph.guard(): - var_inp = fluid.dygraph.base.to_variable(np_inp) + var_inp = paddle.to_tensor(np_inp) mlp = MLP(input_size=2) out = mlp(var_inp) dy_out = out.numpy() @@ -456,7 +499,7 @@ def test_mlp(self): dy_grad = mlp._linear1.weight.gradient() with fluid.dygraph.guard(): - var_inp2 = fluid.dygraph.base.to_variable(np_inp) + var_inp2 = paddle.to_tensor(np_inp) mlp2 = MLP(input_size=2) out2 = mlp2(var_inp2) dy_out2 = out2.numpy() @@ -496,6 +539,11 @@ def test_mlp(self): self.assertEqual(mlp._linear2, sublayers[1]) self.assertEqual(len(sublayers), 2) + def test_mlp(self): + with _test_eager_guard(): + self.func_mlp() + self.func_mlp() + def test_gradient_accumulation(self): def test_single_api(sort_sum_gradient): fluid.set_flags({'FLAGS_sort_sum_gradient': sort_sum_gradient}) @@ -628,14 +676,14 @@ def test_mlp(sort_sum_gradient): test_mlp(False) test_mlp(True) - def test_dygraph_vs_static(self): + def func_dygraph_vs_static(self): np_inp1 = np.random.rand(4, 3, 3) np_inp2 = np.random.rand(4, 3, 3) # dynamic graph with fluid.dygraph.guard(): - inp1 = fluid.dygraph.to_variable(np_inp1) - inp2 = fluid.dygraph.to_variable(np_inp2) + inp1 = paddle.to_tensor(np_inp1) + inp2 = paddle.to_tensor(np_inp2) if np.sum(np_inp1) < np.sum(np_inp2): x = fluid.layers.elementwise_add(inp1, inp2) else: @@ -679,13 +727,18 @@ def test_dygraph_vs_static(self): fetch_list=out)[0] self.assertTrue(np.allclose(dygraph_result, static_result)) - def test_rnn(self): + def test_dygraph_vs_static(self): + with _test_eager_guard(): + self.func_dygraph_vs_static() + self.func_dygraph_vs_static() + + def func_rnn(self): np_inp = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]) np_inp = np_inp.reshape((1, 4, 3)) np_inp = np_inp.astype(np.float32) with fluid.dygraph.guard(): - var_inp = fluid.dygraph.base.to_variable(np_inp) + var_inp = paddle.to_tensor(np_inp) var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) simple_rnn = SimpleRNN() outs, pre_hiddens = simple_rnn.forward(var_inp) @@ -696,7 +749,7 @@ def test_rnn(self): dy_grad_i2h = simple_rnn._cell._i2h_w.gradient() with fluid.dygraph.guard(): - var_inp2 = fluid.dygraph.base.to_variable(np_inp) + var_inp2 = paddle.to_tensor(np_inp) var_inp2 = fluid.layers.reshape(var_inp2, shape=[1, 4, 3]) simple_rnn2 = SimpleRNN() outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2) @@ -722,16 +775,21 @@ def test_rnn(self): param_grads[1][1].name, param_grads[2][1].name ]) - self.assertTrue(np.allclose(dy_out, static_out)) - self.assertTrue(np.allclose(dy_grad_h2o, static_grad_h2o)) - self.assertTrue(np.allclose(dy_grad_h2h, static_grad_h2h)) - self.assertTrue(np.allclose(dy_grad_i2h, static_grad_i2h)) - self.assertTrue(np.allclose(dy_out2, static_out)) - self.assertTrue(np.allclose(dy_grad_h2o2, static_grad_h2o)) - self.assertTrue(np.allclose(dy_grad_h2h2, static_grad_h2h)) - self.assertTrue(np.allclose(dy_grad_i2h2, static_grad_i2h)) + 
self.assertTrue(np.array_equal(dy_out, static_out)) + self.assertTrue(np.array_equal(dy_grad_h2o, static_grad_h2o)) + self.assertTrue(np.array_equal(dy_grad_h2h, static_grad_h2h)) + self.assertTrue(np.array_equal(dy_grad_i2h, static_grad_i2h)) + self.assertTrue(np.array_equal(dy_out2, static_out)) + self.assertTrue(np.array_equal(dy_grad_h2o2, static_grad_h2o)) + self.assertTrue(np.array_equal(dy_grad_h2h2, static_grad_h2h)) + self.assertTrue(np.array_equal(dy_grad_i2h2, static_grad_i2h)) - def test_layer_attrs(self): + def test_rnn(self): + with _test_eager_guard(): + self.func_rnn() + self.func_rnn() + + def func_layer_attrs(self): layer = fluid.dygraph.Layer("test") layer.test_attr = 1 self.assertFalse(hasattr(layer, "whatever")) @@ -751,60 +809,90 @@ def test_layer_attrs(self): my_layer.l1 = None self.assertEqual(len(my_layer.sublayers()), 0) + def test_layer_attrs(self): + with _test_eager_guard(): + self.func_layer_attrs() + self.func_layer_attrs() + class TestDygraphUtils(unittest.TestCase): - def test_append_activation_in_dygraph_exception(self): + def func_append_activation_in_dygraph_exception(self): with new_program_scope(): np_inp = np.random.random(size=(10, 20, 30)).astype(np.float32) a = fluid.layers.data("a", [10, 20]) func = dygraph_utils._append_activation_in_dygraph self.assertRaises(AssertionError, func, a, act="sigmoid") - def test_append_activation_in_dygraph1(self): + def test_append_activation_in_dygraph_exception(self): + with _test_eager_guard(): + self.func_append_activation_in_dygraph_exception() + self.func_append_activation_in_dygraph_exception() + + def func_append_activation_in_dygraph1(self): a_np = np.random.random(size=(10, 20, 30)).astype(np.float32) func = dygraph_utils._append_activation_in_dygraph with fluid.dygraph.guard(): - a = fluid.dygraph.to_variable(a_np) + a = paddle.to_tensor(a_np) res1 = func(a, act="hard_sigmoid") res2 = fluid.layers.hard_sigmoid(a) self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) - def test_append_activation_in_dygraph2(self): + def test_append_activation_in_dygraph1(self): + with _test_eager_guard(): + self.func_append_activation_in_dygraph1() + self.func_append_activation_in_dygraph1() + + def func_append_activation_in_dygraph2(self): a_np = np.random.random(size=(10, 20, 30)).astype(np.float32) func = dygraph_utils._append_activation_in_dygraph with fluid.dygraph.guard(): - a = fluid.dygraph.to_variable(a_np) + a = paddle.to_tensor(a_np) res1 = func(a, act="sigmoid", use_mkldnn=True, use_cudnn=True) res2 = fluid.layers.sigmoid(a) self.assertTrue(np.allclose(res1.numpy(), res2.numpy())) - def test_append_activation_in_dygraph3(self): + def test_append_activation_in_dygraph2(self): + with _test_eager_guard(): + self.func_append_activation_in_dygraph2() + self.func_append_activation_in_dygraph2() + + def func_append_activation_in_dygraph3(self): a_np = np.random.random(size=(10, 20, 30)).astype(np.float32) helper = LayerObjectHelper(fluid.unique_name.generate("test")) func = helper.append_activation with fluid.dygraph.guard(): - a = fluid.dygraph.to_variable(a_np) + a = paddle.to_tensor(a_np) res1 = func(a, act="sigmoid", use_cudnn=True) res2 = fluid.layers.sigmoid(a) self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) - def test_append_activation_in_dygraph_use_mkldnn(self): + def test_append_activation_in_dygraph3(self): + with _test_eager_guard(): + self.func_append_activation_in_dygraph3() + self.func_append_activation_in_dygraph3() + + def func_append_activation_in_dygraph_use_mkldnn(self): a_np = 
np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) helper = LayerHelper( fluid.unique_name.generate("test"), act="relu", use_mkldnn=True) func = helper.append_activation with fluid.dygraph.guard(): - a = fluid.dygraph.to_variable(a_np) + a = paddle.to_tensor(a_np) res1 = func(a) res2 = fluid.layers.relu(a) self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) - def test_append_activation_in_dygraph_global_use_mkldnn(self): + def test_append_activation_in_dygraph_use_mkldnn(self): + with _test_eager_guard(): + self.func_append_activation_in_dygraph_use_mkldnn() + self.func_append_activation_in_dygraph_use_mkldnn() + + def func_append_activation_in_dygraph_global_use_mkldnn(self): a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) helper = LayerHelper(fluid.unique_name.generate("test"), act="relu") func = helper.append_activation with fluid.dygraph.guard(fluid.core.CPUPlace()): - a = fluid.dygraph.to_variable(a_np) + a = paddle.to_tensor(a_np) fluid.set_flags({'FLAGS_use_mkldnn': True}) try: res1 = func(a) @@ -813,38 +901,67 @@ def test_append_activation_in_dygraph_global_use_mkldnn(self): res2 = fluid.layers.relu(a) self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) - def test_append_bias_in_dygraph_exception(self): + def test_append_activation_in_dygraph_global_use_mkldnn(self): + with _test_eager_guard(): + self.func_append_activation_in_dygraph_global_use_mkldnn() + self.func_append_activation_in_dygraph_global_use_mkldnn() + + def func_append_bias_in_dygraph_exception(self): with new_program_scope(): np_inp = np.random.random(size=(10, 20, 30)).astype(np.float32) a = fluid.layers.data("a", [10, 20]) func = dygraph_utils._append_bias_in_dygraph self.assertRaises(AssertionError, func, a) - def test_append_bias_in_dygraph(self): + def test_append_bias_in_dygraph_exception(self): + with _test_eager_guard(): + self.func_append_bias_in_dygraph_exception() + self.func_append_bias_in_dygraph_exception() + + def func_append_bias_in_dygraph(self): a_np = np.random.random(size=(10, 20, 30)).astype(np.float32) func = dygraph_utils._append_bias_in_dygraph with fluid.dygraph.guard(): - a = fluid.dygraph.to_variable(a_np) + a = paddle.to_tensor(a_np) res1 = func(a, bias=a) - res2 = a + a + res2 = paddle.add(a, a) self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + def test_append_bias_in_dygraph(self): + with _test_eager_guard(): + self.func_append_bias_in_dygraph() + self.func_append_bias_in_dygraph() + class TestDygraphGuardWithError(unittest.TestCase): - def test_without_guard(self): + def func_without_guard(self): with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(np.zeros([10, 10])) + x = paddle.to_tensor(np.zeros([10, 10])) with self.assertRaisesRegexp(TypeError, "Please use `with fluid.dygraph.guard()"): y = fluid.layers.matmul(x, x) + def test_without_guard(self): + with _test_eager_guard(): + self.func_without_guard() + self.func_without_guard() + class TestMetaclass(unittest.TestCase): - def test_metaclass(self): + def func_metaclass(self): self.assertEqual(type(MyLayer).__name__, 'type') self.assertNotEqual(type(MyLayer).__name__, 'pybind11_type') - self.assertEqual( - type(paddle.fluid.core.VarBase).__name__, 'pybind11_type') + if core._in_eager_mode(): + self.assertEqual( + type(paddle.fluid.core.eager.EagerTensor).__name__, 'type') + else: + self.assertEqual( + type(paddle.fluid.core.VarBase).__name__, 'pybind11_type') + + def test_metaclass(self): + with _test_eager_guard(): + self.func_metaclass() + self.func_metaclass() if 
__name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_sequential.py b/python/paddle/fluid/tests/unittests/test_imperative_container_sequential.py index 972f1b64e1407..dcf4e8de5e441 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_container_sequential.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_container_sequential.py @@ -17,10 +17,11 @@ import unittest import paddle.fluid as fluid import numpy as np +from paddle.fluid.framework import _test_eager_guard class TestImperativeContainerSequential(unittest.TestCase): - def test_sequential(self): + def func_sequential(self): data = np.random.uniform(-1, 1, [5, 10]).astype('float32') with fluid.dygraph.guard(): data = fluid.dygraph.to_variable(data) @@ -55,7 +56,12 @@ def test_sequential(self): loss2 = fluid.layers.reduce_mean(res2) loss2.backward() - def test_sequential_list_params(self): + def test_sequential(self): + with _test_eager_guard(): + self.func_sequential() + self.func_sequential() + + def func_sequential_list_params(self): data = np.random.uniform(-1, 1, [5, 10]).astype('float32') with fluid.dygraph.guard(): data = fluid.dygraph.to_variable(data) @@ -90,6 +96,11 @@ def test_sequential_list_params(self): loss2 = fluid.layers.reduce_mean(res2) loss2.backward() + def test_sequential_list_params(self): + with _test_eager_guard(): + self.func_sequential_list_params() + self.func_sequential_list_params() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py index b388efc5f3e01..8aadb155b0c0a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py @@ -21,6 +21,7 @@ import paddle.compat as cpt from paddle.fluid import core +from paddle.fluid.framework import _test_eager_guard def set_child_signal_handler(self, child_pid): @@ -37,8 +38,8 @@ def __handler__(signum, frame): signal.signal(signal.SIGCHLD, __handler__) -class TestDygraphDataLoaderSingalHandler(unittest.TestCase): - def test_child_process_exit_with_error(self): +class DygraphDataLoaderSingalHandler(unittest.TestCase): + def func_child_process_exit_with_error(self): def __test_process__(): core._set_process_signal_handler() sys.exit(1) @@ -65,7 +66,12 @@ def try_except_exit(): self.assertIsNotNone(exception) - def test_child_process_killed_by_sigsegv(self): + def test_child_process_exit_with_error(self): + with _test_eager_guard(): + self.func_child_process_exit_with_error() + self.func_child_process_exit_with_error() + + def func_child_process_killed_by_sigsegv(self): def __test_process__(): core._set_process_signal_handler() os.kill(os.getpid(), signal.SIGSEGV) @@ -93,7 +99,12 @@ def try_except_exit(): self.assertIsNotNone(exception) - def test_child_process_killed_by_sigbus(self): + def test_child_process_killed_by_sigsegv(self): + with _test_eager_guard(): + self.func_child_process_killed_by_sigsegv() + self.func_child_process_killed_by_sigsegv() + + def func_child_process_killed_by_sigbus(self): def __test_process__(): core._set_process_signal_handler() os.kill(os.getpid(), signal.SIGBUS) @@ -120,7 +131,12 @@ def try_except_exit(): self.assertIsNotNone(exception) - def test_child_process_killed_by_sigterm(self): + def test_child_process_killed_by_sigbus(self): + with _test_eager_guard(): + self.func_child_process_killed_by_sigbus() + 
self.func_child_process_killed_by_sigbus() + + def func_child_process_killed_by_sigterm(self): def __test_process__(): core._set_process_signal_handler() time.sleep(10) @@ -132,6 +148,11 @@ def __test_process__(): set_child_signal_handler(id(self), test_process.pid) time.sleep(1) + def test_child_process_killed_by_sigterm(self): + with _test_eager_guard(): + self.func_child_process_killed_by_sigterm() + self.func_child_process_killed_by_sigterm() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index e114961c0cc9a..33f304ef33d67 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -16,6 +16,7 @@ import paddle.fluid as fluid import numpy as np import unittest +from paddle import _C_ops if fluid.is_compiled_with_cuda(): fluid.core.globals()['FLAGS_cudnn_deterministic'] = True @@ -112,8 +113,8 @@ def __init__(self, num_channels, epsilon=1e-5): def forward(self, input): if fluid.in_dygraph_mode(): - out, _, _ = fluid.core.ops.instance_norm( - input, self.scale, self.bias, 'epsilon', self.epsilon) + out, _, _ = _C_ops.instance_norm(input, self.scale, self.bias, + 'epsilon', self.epsilon) return out else: return fluid.layers.instance_norm( diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 6fdad811ee885..bff10c9c4ca26 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -65,7 +65,7 @@ def test_constant_initializer_default_value(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.ConstantInitializer()) - num_ops = 2 if dtype == "float16" else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -84,7 +84,7 @@ def test_constant_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.ConstantInitializer(2.3)) - num_ops = 2 if dtype == "float16" else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -94,10 +94,8 @@ def test_constant_initializer(self, dtype="float32"): def test_constant_initializer_fp16(self): """Test constant initializer with float16 """ - block = self.test_constant_initializer_default_value("float16") - self.assertTrue(check_cast_op(block.ops[1])) - block = self.test_constant_initializer("float16") - self.assertTrue(check_cast_op(block.ops[1])) + self.test_constant_initializer_default_value("float16") + self.test_constant_initializer("float16") def test_constant_initializer_bf16(self): """Test constant initializer with bfloat16 @@ -246,7 +244,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.NormalInitializer(2.3, 1.9, 123)) - num_ops = 2 if dtype in ["float16", "uint16"] else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -258,14 +256,12 @@ def test_normal_initializer(self, dtype="float32"): def test_normal_initializer_fp16(self): """Test normal initializer with float16 """ - block = self.test_normal_initializer("float16") - 
self.assertTrue(check_cast_op(block.ops[1])) + self.test_normal_initializer("float16") def test_normal_initializer_bf16(self): """Test normal initializer with bfloat16 """ - block = self.test_normal_initializer("uint16") - self.assertTrue(check_cast_op(block.ops[1])) + self.test_normal_initializer("uint16") class TestXavierInitializer(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py index 85815c5eeef30..74686652044ec 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer_nn.py +++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py @@ -54,7 +54,7 @@ def static_test_constant_initializer_common(self, lod_level=0, name="param", initializer=init_inst) - num_ops = 2 if dtype in ["float16"] else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -103,9 +103,7 @@ def test_constant_initializer_fp16(self): """Test constant initializer with float16 """ block = self.test_constant_initializer_default_value_static("float16") - self.assertTrue(check_cast_op(block.ops[1])) block = self.test_constant_initializer_static("float16") - self.assertTrue(check_cast_op(block.ops[1])) self.test_constant_initializer_default_value_dygraph("float16") self.test_constant_initializer_dygraph("float16") @@ -402,7 +400,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.Normal(2.3, 1.9)) - num_ops = 2 if dtype in ["float16", "uint16"] else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -417,13 +415,11 @@ def test_normal_initializer_fp16(self): """Test normal initializer with float16 """ block = self.test_normal_initializer("float16") - self.assertTrue(check_cast_op(block.ops[1])) def test_normal_initializer_bf16(self): """Test normal initializer with bfloat16 """ block = self.test_normal_initializer("uint16") #bfloat16 - self.assertTrue(check_cast_op(block.ops[1])) def test_normal_initializer_dygraph(self): """Test normal initializer in dygraph model. diff --git a/python/paddle/fluid/tests/unittests/test_inner.py b/python/paddle/fluid/tests/unittests/test_inner.py new file mode 100644 index 0000000000000..de9decd0b8961 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_inner.py @@ -0,0 +1,166 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
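For orientation before the new test_inner.py cases that follow: paddle.inner mirrors numpy.inner, contracting the last axis of both inputs (a plain dot product for 1-D inputs). Below is a minimal usage sketch, assuming only a working Paddle install; the shapes are illustrative and not taken from the tests.

import numpy as np
import paddle

x = paddle.to_tensor(np.random.rand(2, 3).astype('float32'))
y = paddle.to_tensor(np.random.rand(4, 3).astype('float32'))
out = paddle.inner(x, y)  # contracts the last axis of x and y -> shape [2, 4]
np.testing.assert_allclose(out.numpy(), np.inner(x.numpy(), y.numpy()), rtol=1e-5, atol=1e-6)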
+ +from __future__ import print_function +import unittest + +import numpy as np + +import paddle +from paddle.static import Program, program_guard + + +class TestMultiplyApi(unittest.TestCase): + def _run_static_graph_case(self, x_data, y_data): + with program_guard(Program(), Program()): + paddle.enable_static() + x = paddle.static.data( + name='x', shape=x_data.shape, dtype=x_data.dtype) + y = paddle.static.data( + name='y', shape=y_data.shape, dtype=y_data.dtype) + res = paddle.inner(x, y) + + place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda( + ) else paddle.CPUPlace() + exe = paddle.static.Executor(place) + outs = exe.run(paddle.static.default_main_program(), + feed={'x': x_data, + 'y': y_data}, + fetch_list=[res]) + res = outs[0] + return res + + def _run_dynamic_graph_case(self, x_data, y_data): + paddle.disable_static() + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.inner(x, y) + return res.numpy() + + def test_multiply(self): + np.random.seed(7) + + # test static computation graph: 3-d array + x_data = np.random.rand(2, 10, 10).astype(np.float64) + y_data = np.random.rand(2, 5, 10).astype(np.float64) + res = self._run_static_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.inner(x_data, y_data))) + + # test static computation graph: 2-d array + x_data = np.random.rand(200, 5).astype(np.float64) + y_data = np.random.rand(50, 5).astype(np.float64) + res = self._run_static_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.inner(x_data, y_data))) + + # test static computation graph: 1-d array + x_data = np.random.rand(50).astype(np.float64) + y_data = np.random.rand(50).astype(np.float64) + res = self._run_static_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.inner(x_data, y_data))) + + # test dynamic computation graph: 3-d array + x_data = np.random.rand(5, 10, 10).astype(np.float64) + y_data = np.random.rand(2, 10).astype(np.float64) + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.inner(x_data, y_data))) + + # test dynamic computation graph: 2-d array + x_data = np.random.rand(20, 50).astype(np.float64) + y_data = np.random.rand(50).astype(np.float64) + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.inner(x_data, y_data))) + + # test dynamic computation graph: Scalar + x_data = np.random.rand(20, 10).astype(np.float32) + y_data = np.random.rand(1).astype(np.float32).item() + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.inner(x_data, y_data))) + + # test dynamic computation graph: 2-d array Complex + x_data = np.random.rand(20, + 50).astype(np.float64) + 1J * np.random.rand( + 20, 50).astype(np.float64) + y_data = np.random.rand(50).astype(np.float64) + 1J * np.random.rand( + 50).astype(np.float64) + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.inner(x_data, y_data))) + + # test dynamic computation graph: 3-d array Complex + x_data = np.random.rand(5, 10, + 10).astype(np.float64) + 1J * np.random.rand( + 5, 10, 10).astype(np.float64) + y_data = np.random.rand(2, 10).astype(np.float64) + 1J * np.random.rand( + 2, 10).astype(np.float64) + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.inner(x_data, y_data))) + + +class TestMultiplyError(unittest.TestCase): + def test_errors(self): + # test static computation graph: dtype can not be int8 + paddle.enable_static() + with 
program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[100], dtype=np.int8) + y = paddle.static.data(name='y', shape=[100], dtype=np.int8) + self.assertRaises(TypeError, paddle.inner, x, y) + + # test static computation graph: inputs must be broadcastable + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[20, 50], dtype=np.float64) + y = paddle.static.data(name='y', shape=[20], dtype=np.float64) + self.assertRaises(ValueError, paddle.inner, x, y) + + np.random.seed(7) + # test dynamic computation graph: dtype can not be int8 + paddle.disable_static() + x_data = np.random.randn(200).astype(np.int8) + y_data = np.random.randn(200).astype(np.int8) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + self.assertRaises(RuntimeError, paddle.inner, x, y) + + # test dynamic computation graph: inputs must be broadcastable + x_data = np.random.rand(20, 5) + y_data = np.random.rand(10, 2) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + self.assertRaises(ValueError, paddle.inner, x, y) + + # test dynamic computation graph: dtype must be same + x_data = np.random.randn(200).astype(np.float32) + y_data = np.random.randn(200).astype(np.float64) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + self.assertRaises(ValueError, paddle.inner, x, y) + + # test dynamic computation graph: dtype must be Tensor type + x_data = np.random.randn(200).astype(np.float64) + y_data = np.random.randn(200).astype(np.float64) + y = paddle.to_tensor(y_data) + self.assertRaises(ValueError, paddle.inner, x_data, y) + + # test dynamic computation graph: dtype must be Tensor type + x_data = np.random.randn(200).astype(np.float64) + y_data = np.random.randn(200).astype(np.float64) + x = paddle.to_tensor(x_data) + self.assertRaises(ValueError, paddle.inner, x, y_data) + + # test dynamic computation graph: dtype must be Tensor type + x_data = np.random.randn(200).astype(np.float32) + y_data = np.random.randn(200).astype(np.float32) + self.assertRaises(ValueError, paddle.inner, x_data, y_data) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inplace.py b/python/paddle/fluid/tests/unittests/test_inplace.py index 98e2d2367fd5e..316db18753511 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace.py +++ b/python/paddle/fluid/tests/unittests/test_inplace.py @@ -434,5 +434,18 @@ def test_loss_is_inplace_var(self): self.assertTrue(np.array_equal(inplace_grad_var_a, grad_var_a)) +class TestContinuouslyInplace(unittest.TestCase): + def test_continuously_inplace(self): + a = paddle.rand([2, 3]) + a.stop_gradient = False + b = a * 2 + + b.reshape_([-1]) + b.reshape_([2, 3]) + b.reshape_([-1]) + + b.backward() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_kthvalue_op.py b/python/paddle/fluid/tests/unittests/test_kthvalue_op.py new file mode 100644 index 0000000000000..68dd58835c56c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_kthvalue_op.py @@ -0,0 +1,194 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid + + +def cal_kthvalue(x, k, axis, keepdim=False): + if axis < 0: + axis = len(x.shape) + axis + indices = np.argsort(x, axis=axis) + value = np.sort(x, axis=axis) + indices = indices.take(indices=k - 1, axis=axis) + value = value.take(indices=k - 1, axis=axis) + if keepdim: + indices = np.expand_dims(indices, axis) + value = np.expand_dims(value, axis) + return value, indices + + +class TestKthvalueOp(OpTest): + def init_args(self): + self.k = 5 + self.axis = -1 + + def setUp(self): + self.op_type = "kthvalue" + self.dtype = np.float64 + self.input_data = np.random.random((2, 1, 2, 4, 10)) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis} + output, indices = cal_kthvalue( + self.input_data, k=self.k, axis=self.axis) + self.outputs = {'Out': output, 'Indices': indices} + + def test_check_output(self): + paddle.enable_static() + self.check_output() + + def test_check_grad(self): + paddle.enable_static() + self.check_grad(set(['X']), 'Out') + + +class TestKthvalueOpWithKeepdim(OpTest): + def init_args(self): + self.k = 2 + self.axis = 1 + + def setUp(self): + self.init_args() + self.op_type = "kthvalue" + self.dtype = np.float64 + self.input_data = np.random.random((1, 3, 2, 4, 10)) + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'keepdim': True} + output, indices = cal_kthvalue( + self.input_data, k=self.k, axis=self.axis, keepdim=True) + self.outputs = {'Out': output, 'Indices': indices} + + def test_check_output(self): + paddle.enable_static() + self.check_output() + + def test_check_grad(self): + paddle.enable_static() + self.check_grad(set(['X']), 'Out') + + +class TestKthvalueOpKernels(unittest.TestCase): + def setUp(self): + self.axises = [2, -1] + + def test_kthvalue_op(self): + paddle.disable_static() + + def test_cpu_kernel(): + shape = (2, 128, 10) + k = 2 + paddle.set_device('cpu') + inputs = np.random.random(shape) + tensor = paddle.to_tensor(inputs) + for axis in self.axises: + value_expect, indice_expect = cal_kthvalue(inputs, k, axis) + v, inds = paddle.kthvalue(tensor, k, axis) + self.assertTrue(np.allclose(v.numpy(), value_expect)) + self.assertTrue(np.allclose(inds.numpy(), indice_expect)) + + def test_gpu_kernel(): + shape = (2, 30, 250) + k = 244 + paddle.set_device('gpu') + inputs = np.random.random(shape) + tensor = paddle.to_tensor(inputs) + for axis in self.axises: + value_expect, indice_expect = cal_kthvalue(inputs, k, axis) + v, inds = paddle.kthvalue(tensor, k, axis) + self.assertTrue(np.allclose(v.numpy(), value_expect)) + self.assertTrue(np.allclose(inds.numpy(), indice_expect)) + + test_cpu_kernel() + if fluid.core.is_compiled_with_cuda(): + test_gpu_kernel() + + +class TestKthvalueOpWithNaN(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.uniform([2, 200, 10], dtype='float32') + + def test_errors(self): + def test_nan_in_cpu_kernel(): + paddle.set_device('cpu') + nan_position 
= 100 + self.x[0, nan_position, 2] = float('nan') + v, inds = self.x.kthvalue(k=200, axis=1) + self.assertTrue(np.isnan(v[0, 2].numpy()[0])) + self.assertEqual(inds[0, 2].numpy()[0], nan_position) + + def test_nan_in_gpu_kernel(): + paddle.set_device('gpu') + nan_position = 100 + self.x[0, nan_position, 2] = float('nan') + v, inds = self.x.kthvalue(k=200, axis=1) + self.assertTrue(np.isnan(v[0, 2].numpy()[0])) + self.assertEqual(inds[0, 2].numpy()[0], nan_position) + + test_nan_in_cpu_kernel() + if fluid.core.is_compiled_with_cuda(): + test_nan_in_gpu_kernel() + + +class TestKthvalueOpErrors(unittest.TestCase): + def setUp(self): + self.x = paddle.uniform([2, 10, 20, 25], dtype='float32') + + def test_errors(self): + paddle.disable_static() + + def test_k_lowrange_error(): + self.x.kthvalue(k=0, axis=2) + + self.assertRaises(ValueError, test_k_lowrange_error) + + def test_k_uprange_error(): + self.x.kthvalue(k=500, axis=2) + + self.assertRaises(ValueError, test_k_uprange_error) + + def test_dim_range_error(): + self.x.kthvalue(k=10, axis=5) + + self.assertRaises(ValueError, test_dim_range_error) + + +class TestModeOpInStatic(unittest.TestCase): + def setUp(self): + np.random.seed(666) + self.input_data = np.random.random((2, 20, 1, 2, 80)).astype(np.float64) + self.k = 10 + + def test_run_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_tensor = paddle.static.data( + name="x", shape=[2, 20, 1, 2, 80], dtype="float64") + result = paddle.kthvalue(input_tensor, self.k, axis=1) + expect_value = cal_kthvalue(self.input_data, self.k, axis=1)[0] + exe = paddle.static.Executor(paddle.CPUPlace()) + paddle_result = exe.run(feed={"x": self.input_data}, + fetch_list=[result])[0] + self.assertTrue(np.allclose(paddle_result, expect_value)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py new file mode 100644 index 0000000000000..4c0325a35f32e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
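Before the new test_linalg_lstsq_op.py cases that follow, a short usage sketch of the API under test: paddle.linalg.lstsq returns a (solution, residuals, rank, singular_values) tuple in the same spirit as numpy.linalg.lstsq. The snippet is illustrative only and assumes a CPU build; the 'gelsd' driver matches what these tests use on CPU.

import numpy as np
import paddle

a = np.random.rand(5, 3).astype('float64')
b = np.random.rand(5, 2).astype('float64')
solution, residuals, rank, sv = paddle.linalg.lstsq(
    paddle.to_tensor(a), paddle.to_tensor(b), driver='gelsd')
np_solution = np.linalg.lstsq(a, b, rcond=None)[0]  # compare against the NumPy reference
np.testing.assert_allclose(solution.numpy(), np_solution, rtol=1e-6, atol=1e-8)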
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid + + +class LinalgLstsqTestCase(unittest.TestCase): + def setUp(self): + self.init_config() + self.generate_input() + self.generate_output() + + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.driver = "gelsd" + self._input_shape_1 = (5, 4) + self._input_shape_2 = (5, 3) + + def generate_input(self): + self._input_data_1 = np.random.random(self._input_shape_1).astype( + self.dtype) + self._input_data_2 = np.random.random(self._input_shape_2).astype( + self.dtype) + + def generate_output(self): + if len(self._input_shape_1) == 2: + out = np.linalg.lstsq( + self._input_data_1, self._input_data_2, rcond=self.rcond) + elif len(self._input_shape_1) == 3: + out = np.linalg.lstsq( + self._input_data_1[0], self._input_data_2[0], rcond=self.rcond) + + self._output_solution = out[0] + self._output_residuals = out[1] + self._output_rank = out[2] + self._output_sg_values = out[3] + + def test_dygraph(self): + paddle.disable_static() + paddle.device.set_device("cpu") + place = paddle.CPUPlace() + x = paddle.to_tensor(self._input_data_1, place=place, dtype=self.dtype) + y = paddle.to_tensor(self._input_data_2, place=place, dtype=self.dtype) + results = paddle.linalg.lstsq( + x, y, rcond=self.rcond, driver=self.driver) + + res_solution = results[0].numpy() + res_residuals = results[1].numpy() + res_rank = results[2].numpy() + res_singular_values = results[3].numpy() + + if x.shape[-2] > x.shape[-1] and self._output_rank == x.shape[-1]: + if (np.abs(res_residuals - self._output_residuals) < 1e-6).any(): + pass + else: + raise RuntimeError("Check LSTSQ residuals dygraph Failed") + + if self.driver in ("gelsy", "gelsd", "gelss"): + if (np.abs(res_rank - self._output_rank) < 1e-6).any(): + pass + else: + raise RuntimeError("Check LSTSQ rank dygraph Failed") + + if self.driver in ("gelsd", "gelss"): + if (np.abs(res_singular_values - self._output_sg_values) < 1e-6 + ).any(): + pass + else: + raise RuntimeError("Check LSTSQ singular values dygraph Failed") + + def test_static(self): + paddle.enable_static() + paddle.device.set_device("cpu") + place = fluid.CPUPlace() + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = paddle.fluid.data( + name="x", + shape=self._input_shape_1, + dtype=self._input_data_1.dtype) + y = paddle.fluid.data( + name="y", + shape=self._input_shape_2, + dtype=self._input_data_2.dtype) + results = paddle.linalg.lstsq( + x, y, rcond=self.rcond, driver=self.driver) + exe = fluid.Executor(place) + fetches = exe.run( + fluid.default_main_program(), + feed={"x": self._input_data_1, + "y": self._input_data_2}, + fetch_list=[results]) + + if x.shape[-2] > x.shape[-1] and self._output_rank == x.shape[-1]: + if (np.abs(fetches[1] - self._output_residuals) < 1e-6).any(): + pass + else: + raise RuntimeError("Check LSTSQ residuals static Failed") + + if self.driver in ("gelsy", "gelsd", "gelss"): + if (np.abs(fetches[2] - self._output_rank) < 1e-6).any(): + pass + else: + raise RuntimeError("Check LSTSQ rank static Failed") + + if self.driver in ("gelsd", "gelss"): + if (np.abs(fetches[3] - self._output_sg_values) < 1e-6).any(): + pass + else: + raise RuntimeError( + "Check LSTSQ singular values static Failed") + + +class LinalgLstsqTestCase(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.driver = "gels" + self._input_shape_1 = (5, 10) + self._input_shape_2 = (5, 5) + + +class 
LinalgLstsqTestCaseRcond(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float64' + self.rcond = 0.1 + self.driver = "gels" + self._input_shape_1 = (3, 2) + self._input_shape_2 = (3, 3) + + +class LinalgLstsqTestCaseGelsFloat32(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float32' + self.rcond = 1e-15 + self.driver = "gels" + self._input_shape_1 = (10, 5) + self._input_shape_2 = (10, 2) + + +class LinalgLstsqTestCaseGelssFloat64(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.driver = "gelss" + self._input_shape_1 = (5, 5) + self._input_shape_2 = (5, 1) + + +class LinalgLstsqTestCaseGelsyFloat32(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float32' + self.rcond = 1e-15 + self.driver = "gelsy" + self._input_shape_1 = (8, 2) + self._input_shape_2 = (8, 10) + + +class LinalgLstsqTestCaseBatch1(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float32' + self.rcond = 1e-15 + self.driver = None + self._input_shape_1 = (2, 3, 10) + self._input_shape_2 = (2, 3, 4) + + +class LinalgLstsqTestCaseBatch2(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.driver = "gelss" + self._input_shape_1 = (2, 8, 6) + self._input_shape_2 = (2, 8, 2) + + +class LinalgLstsqTestCaseLarge1(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.driver = "gelsd" + self._input_shape_1 = (200, 100) + self._input_shape_2 = (200, 50) + + +class LinalgLstsqTestCaseLarge2(LinalgLstsqTestCase): + def init_config(self): + self.dtype = 'float32' + self.rcond = 1e-15 + self.driver = "gelss" + self._input_shape_1 = (50, 600) + self._input_shape_2 = (50, 300) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index 04a0d47e47c86..d62a633c28576 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -205,6 +205,13 @@ def lambda_lr(epoch_num, learning_rate, lr_lambda, verbose=False): return learning_rate * lr_lambda(epoch_num) +def multiplicative_lr(epoch_num, learning_rate, lr_lambda, verbose=False): + latest_lr = learning_rate + for i in range(epoch_num): + latest_lr = latest_lr * lr_lambda(i + 1) + return latest_lr + + def piecewise_lr(epoch_num, boundaries, values, verbose=False): assert len(boundaries) + 1 == len(values) for i in range(len(boundaries)): @@ -519,6 +526,10 @@ def test_scheduler(self): "learning_rate": 0.5, "lr_lambda": lambda x: 0.95**x, "verbose": True + }), (multiplicative_lr, paddle.optimizer.lr.MultiplicativeDecay, { + "learning_rate": 0.5, + "lr_lambda": lambda x: 0.95, + "verbose": True }), (cosine_annealing_lr, paddle.optimizer.lr.CosineAnnealingDecay, { "learning_rate": 0.5, "T_max": 10, diff --git a/python/paddle/fluid/tests/unittests/test_lu_op.py b/python/paddle/fluid/tests/unittests/test_lu_op.py new file mode 100644 index 0000000000000..1f1e3d1a2fb02 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lu_op.py @@ -0,0 +1,285 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from op_test import OpTest +import unittest +import itertools +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.core as core +import scipy +import scipy.linalg +import copy + + +def scipy_lu(A, pivot): + shape = A.shape + if len(shape) == 2: + return scipy.linalg.lu(A, permute_l=not pivot) + else: + preshape = shape[:-2] + batchsize = np.product(shape) // (shape[-2] * shape[-1]) + PP = [] + PL = [] + PU = [] + NA = A.reshape((-1, shape[-2], shape[-1])) + for b in range(batchsize): + P, L, U = scipy.linalg.lu(NA[b], permute_l=not pivot) + pshape = P.shape + lshape = L.shape + ushape = U.shape + PP.append(P) + PL.append(L) + PU.append(U) + return np.array(PP).reshape(preshape + pshape), np.array(PL).reshape( + preshape + lshape), np.array(PU).reshape(preshape + ushape) + + +def Pmat_to_perm(Pmat_org, cut): + Pmat = copy.deepcopy(Pmat_org) + shape = Pmat.shape + rows = shape[-2] + cols = shape[-1] + batchsize = max(1, np.product(shape[:-2])) + P = Pmat.reshape(batchsize, rows, cols) + permmat = [] + for b in range(batchsize): + permlst = [] + sP = P[b] + for c in range(min(rows, cols)): + idx = np.argmax(sP[:, c]) + permlst.append(idx) + tmp = copy.deepcopy(sP[c, :]) + sP[c, :] = sP[idx, :] + sP[idx, :] = tmp + + permmat.append(permlst) + Pivot = np.array(permmat).reshape(list(shape[:-2]) + [rows, ]) + 1 + return Pivot[..., :cut] + + +def perm_to_Pmat(perm, dim): + pshape = perm.shape + bs = int(np.product(perm.shape[:-1]).item()) + perm = perm.reshape((bs, pshape[-1])) + oneslst = [] + for i in range(bs): + idlst = np.arange(dim) + perm_item = perm[i, :] + for idx, p in enumerate(perm_item - 1): + temp = idlst[idx] + idlst[idx] = idlst[p] + idlst[p] = temp + + ones = paddle.eye(dim) + nmat = paddle.scatter(ones, paddle.to_tensor(idlst), ones) + oneslst.append(nmat) + return np.array(oneslst).reshape(list(pshape[:-1]) + [dim, dim]) + + +# m < n +class TestLUOp(OpTest): + """ + case 1 + """ + + def config(self): + self.x_shape = [3, 10, 12] + self.pivot = True + self.get_infos = True + self.dtype = "float64" + + def set_output(self): + X = self.inputs['X'] + sP, sl, sU = scipy_lu(X, self.pivot) + sL = np.tril(sl, -1) + ashape = np.array(X.shape) + lshape = np.array(sL.shape) + ushape = np.array(sU.shape) + + lpad = (len(sL.shape) - 2) * [(0, 0)] + list(( + (0, (ashape - lshape)[-2]), (0, (ashape - lshape)[-1]))) + upad = (len(sU.shape) - 2) * [(0, 0)] + list(( + (0, (ashape - ushape)[-2]), (0, (ashape - ushape)[-1]))) + + NsL = np.pad(sL, lpad) + NsU = np.pad(sU, upad) + NLU = NsL + NsU + self.output = NLU + self.Pivots = Pmat_to_perm(sP, min(ashape[-2], ashape[-1])) + self.Infos = np.zeros(self.x_shape[:-2]) if len( + X.shape) > 2 else np.array([0]) + + def setUp(self): + self.op_type = "lu" + self.config() + + self.inputs = {'X': np.random.random(self.x_shape).astype(self.dtype)} + self.attrs = {'pivots': self.pivot} + self.set_output() + self.outputs = { + 'Out': self.output, + 'Pivots': self.Pivots, + 'Infos': self.Infos + } + + def test_check_output(self): + 
self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], ['Out']) + + +# m = n 2D +class TestLUOp2(TestLUOp): + """ + case 2 + """ + + def config(self): + self.x_shape = [10, 10] + self.pivot = True + self.get_infos = True + self.dtype = "float64" + + +# m > n +class TestLUOp3(TestLUOp): + """ + case 3 + """ + + def config(self): + self.x_shape = [2, 12, 10] + self.pivot = True + self.get_infos = True + self.dtype = "float64" + + +class TestLUAPI(unittest.TestCase): + def test_dygraph(self): + def run_lu_dygraph(shape, dtype): + if dtype == "float32": + np_dtype = np.float32 + elif dtype == "float64": + np_dtype = np.float64 + a = np.random.rand(*shape).astype(np_dtype) + m = a.shape[-2] + n = a.shape[-1] + min_mn = min(m, n) + pivot = True + + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + paddle.disable_static(place) + batch_size = a.size // (a.shape[-1] * a.shape[-2]) + x = paddle.to_tensor(a, dtype=dtype) + sP, sl, sU = scipy_lu(a, pivot) + sL = np.tril(sl, -1) + LU, P, Info = paddle.linalg.lu(x, pivot=pivot, get_infos=True) + m, n = LU.shape[-2], LU.shape[-1] + tril = np.tril(LU, -1)[..., :m, :m] + triu = np.triu(LU)[..., :n, :n] + mtp = Pmat_to_perm(sP, min(m, n)) + nP = perm_to_Pmat(P, sP.shape[-1]) + + self.assertTrue(np.allclose(sU, triu, atol=1e-5)) + self.assertTrue(np.allclose(sL, tril, atol=1e-5)) + self.assertTrue(np.allclose(P, mtp, atol=1e-5)) + self.assertTrue(np.allclose(nP, sP, atol=1e-5)) + + tensor_shapes = [ + (3, 5), + (5, 5), + (5, 3), # 2-dim Tensors + (2, 3, 5), + (3, 5, 5), + (4, 5, 3), # 3-dim Tensors + (2, 5, 3, 5), + (3, 5, 5, 5), + (4, 5, 5, 3) # 4-dim Tensors + ] + dtypes = ["float32", "float64"] + for tensor_shape, dtype in itertools.product(tensor_shapes, dtypes): + run_lu_dygraph(tensor_shape, dtype) + + def test_static(self): + paddle.enable_static() + + def run_lu_static(shape, dtype): + if dtype == "float32": + np_dtype = np.float32 + elif dtype == "float64": + np_dtype = np.float64 + a = np.random.rand(*shape).astype(np_dtype) + m = a.shape[-2] + n = a.shape[-1] + min_mn = min(m, n) + pivot = True + + places = [] + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + with fluid.program_guard(fluid.Program(), fluid.Program()): + batch_size = a.size // (a.shape[-1] * a.shape[-2]) + sP, sl, sU = scipy_lu(a, pivot) + sL = np.tril(sl, -1) + ashape = np.array(a.shape) + lshape = np.array(sL.shape) + ushape = np.array(sU.shape) + + lpad = (len(sL.shape) - 2) * [(0, 0)] + list(( + (0, (ashape - lshape)[-2]), (0, (ashape - lshape)[-1]))) + upad = (len(sU.shape) - 2) * [(0, 0)] + list(( + (0, (ashape - ushape)[-2]), (0, (ashape - ushape)[-1]))) + + NsL = np.pad(sL, lpad) + NsU = np.pad(sU, upad) + NLU = NsL + NsU + + x = paddle.fluid.data( + name="input", shape=shape, dtype=dtype) + lu, p = paddle.linalg.lu(x, pivot=pivot) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": a}, + fetch_list=[lu, p]) + self.assertTrue(np.allclose(fetches[0], NLU, atol=1e-5)) + + tensor_shapes = [ + (3, 5), + (5, 5), + (5, 3), # 2-dim Tensors + (2, 3, 5), + (3, 5, 5), + (4, 5, 3), # 3-dim Tensors + (2, 5, 3, 5), + (3, 5, 5, 5), + (4, 5, 5, 3) # 4-dim Tensors + ] + dtypes = ["float32", "float64"] + for tensor_shape, dtype in itertools.product(tensor_shapes, dtypes): + run_lu_static(tensor_shape, dtype) + + +if __name__ == "__main__": + paddle.enable_static() + 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py new file mode 100644 index 0000000000000..0aff38cb78543 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py @@ -0,0 +1,280 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from op_test import OpTest +import unittest +import itertools +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.core as core +import scipy +import scipy.linalg +import copy + + +def scipy_lu_unpack(A): + shape = A.shape + if len(shape) == 2: + return scipy.linalg.lu(A) + else: + preshape = shape[:-2] + batchsize = np.product(shape) // (shape[-2] * shape[-1]) + Plst = [] + Llst = [] + Ulst = [] + + NA = A.reshape((-1, shape[-2], shape[-1])) + for b in range(batchsize): + As = NA[b] + P, L, U = scipy.linalg.lu(As) + + pshape = P.shape + lshape = L.shape + ushape = U.shape + + Plst.append(P) + Llst.append(L) + Ulst.append(U) + + return np.array(Plst).reshape(preshape + pshape), np.array( + Llst).reshape(preshape + lshape), np.array(Ulst).reshape(preshape + + ushape) + + +def Pmat_to_perm(Pmat_org, cut): + Pmat = copy.deepcopy(Pmat_org) + shape = Pmat.shape + rows = shape[-2] + cols = shape[-1] + batchsize = max(1, np.product(shape[:-2])) + P = Pmat.reshape(batchsize, rows, cols) + permmat = [] + for b in range(batchsize): + permlst = [] + sP = P[b] + for c in range(min(rows, cols)): + idx = np.argmax(sP[:, c]) + permlst.append(idx) + tmp = copy.deepcopy(sP[c, :]) + sP[c, :] = sP[idx, :] + sP[idx, :] = tmp + + permmat.append(permlst) + Pivot = np.array(permmat).reshape(list(shape[:-2]) + [rows, ]) + 1 + + return Pivot[..., :cut] + + +def perm_to_Pmat(perm, dim): + pshape = perm.shape + bs = int(np.product(perm.shape[:-1]).item()) + perm = perm.reshape((bs, pshape[-1])) + oneslst = [] + for i in range(bs): + idlst = np.arange(dim) + perm_item = perm[i, :] + for idx, p in enumerate(perm_item - 1): + temp = idlst[idx] + idlst[idx] = idlst[p] + idlst[p] = temp + + ones = paddle.eye(dim) + nmat = paddle.scatter(ones, paddle.to_tensor(idlst), ones) + oneslst.append(nmat) + return np.array(oneslst).reshape(list(pshape[:-1]) + [dim, dim]) + + +# m > n +class TestLU_UnpackOp(OpTest): + """ + case 1 + """ + + def config(self): + self.x_shape = [2, 12, 10] + self.unpack_ludata = True + self.unpack_pivots = True + self.dtype = "float64" + + def set_output(self, A): + sP, sL, sU = scipy_lu_unpack(A) + self.L = sL + self.U = sU + self.P = sP + + def setUp(self): + self.op_type = "lu_unpack" + self.config() + x = np.random.random(self.x_shape).astype(self.dtype) + if paddle.in_dynamic_mode(): + xt = paddle.to_tensor(x) + lu, pivots = paddle.linalg.lu(xt) + lu = lu.numpy() + pivots = pivots.numpy() + else: + with fluid.program_guard(fluid.Program(), fluid.Program()): + place = fluid.CPUPlace() + if 
core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + xv = paddle.fluid.data( + name="input", shape=self.x_shape, dtype=self.dtype) + lu, p = paddle.linalg.lu(xv) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": x}, + fetch_list=[lu, p]) + lu, pivots = fetches[0], fetches[1] + + self.inputs = {'X': lu, 'Pivots': pivots} + + self.attrs = { + 'unpack_ludata': self.unpack_ludata, + 'unpack_pivots': self.unpack_pivots + } + self.set_output(x) + self.outputs = { + 'Pmat': self.P, + 'L': self.L, + 'U': self.U, + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], ['L', 'U']) + + +# m = n +class TestLU_UnpackOp2(TestLU_UnpackOp): + """ + case 2 + """ + + def config(self): + self.x_shape = [2, 10, 10] + self.unpack_ludata = True + self.unpack_pivots = True + self.dtype = "float64" + + +# m < n +class TestLU_UnpackOp3(TestLU_UnpackOp): + """ + case 3 + """ + + def config(self): + self.x_shape = [2, 10, 12] + self.unpack_ludata = True + self.unpack_pivots = True + self.dtype = "float64" + + +class TestLU_UnpackAPI(unittest.TestCase): + def test_dygraph(self): + def run_lu_unpack_dygraph(shape, dtype): + if dtype == "float32": + np_dtype = np.float32 + elif dtype == "float64": + np_dtype = np.float64 + a = np.random.rand(*shape).astype(np_dtype) + m = a.shape[-2] + n = a.shape[-1] + min_mn = min(m, n) + + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + paddle.disable_static(place) + + x = paddle.to_tensor(a, dtype=dtype) + sP, sL, sU = scipy_lu_unpack(a) + LU, P = paddle.linalg.lu(x) + pP, pL, pU = paddle.linalg.lu_unpack(LU, P) + + self.assertTrue(np.allclose(sU, pU, atol=1e-5)) + self.assertTrue(np.allclose(sL, pL, atol=1e-5)) + self.assertTrue(np.allclose(sP, pP, atol=1e-5)) + + tensor_shapes = [ + (3, 5), + (5, 5), + (5, 3), # 2-dim Tensors + (2, 3, 5), + (3, 5, 5), + (4, 5, 3), # 3-dim Tensors + (2, 5, 3, 5), + (3, 5, 5, 5), + (4, 5, 5, 3) # 4-dim Tensors + ] + dtypes = ["float32", "float64"] + for tensor_shape, dtype in itertools.product(tensor_shapes, dtypes): + run_lu_unpack_dygraph(tensor_shape, dtype) + + def test_static(self): + paddle.enable_static() + + def run_lu_static(shape, dtype): + if dtype == "float32": + np_dtype = np.float32 + elif dtype == "float64": + np_dtype = np.float64 + a = np.random.rand(*shape).astype(np_dtype) + m = a.shape[-2] + n = a.shape[-1] + min_mn = min(m, n) + + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + with fluid.program_guard(fluid.Program(), fluid.Program()): + sP, sL, sU = scipy_lu_unpack(a) + + x = paddle.fluid.data( + name="input", shape=shape, dtype=dtype) + lu, p = paddle.linalg.lu(x) + pP, pL, pU = paddle.linalg.lu_unpack(lu, p) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": a}, + fetch_list=[pP, pL, pU]) + self.assertTrue(np.allclose(fetches[0], sP, atol=1e-5)) + self.assertTrue(np.allclose(fetches[1], sL, atol=1e-5)) + self.assertTrue(np.allclose(fetches[2], sU, atol=1e-5)) + + tensor_shapes = [ + (3, 5), + (5, 5), + (5, 3), # 2-dim Tensors + (2, 3, 5), + (3, 5, 5), + (4, 5, 3), # 3-dim Tensors + (2, 5, 3, 5), + (3, 5, 5, 5), + (4, 5, 5, 3) # 4-dim Tensors + ] + dtypes = ["float32", "float64"] + for tensor_shape, dtype in itertools.product(tensor_shapes, dtypes): + run_lu_static(tensor_shape, dtype) + + +if __name__ == "__main__": + 
paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py b/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py new file mode 100644 index 0000000000000..fe00a825ba1cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_max_min_amax_amin_op.py @@ -0,0 +1,179 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestMaxMinAmaxAminAPI(unittest.TestCase): + def setUp(self): + self.init_case() + self.cal_np_out_and_gradient() + self.place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + def init_case(self): + self.x_np = np.array([[0.2, 0.3, 0.5, 0.9], [0.1, 0.2, 0.6, 0.7]]) + self.shape = [2, 4] + self.dtype = 'float64' + self.axis = 0 + self.keepdim = False + + # If there are multiple minimum or maximum elements, max/min/amax/amin is non-derivable, + # its gradient check is not supported by unittest framework, + # thus we calculate the gradient by numpy function. + def cal_np_out_and_gradient(self): + def _cal_np_out_and_gradient(func): + if func is 'amax': + out = np.amax(self.x_np, axis=self.axis, keepdims=self.keepdim) + elif func is 'amin': + out = np.amin(self.x_np, axis=self.axis, keepdims=self.keepdim) + elif func is 'max': + out = np.max(self.x_np, axis=self.axis, keepdims=self.keepdim) + elif func is 'min': + out = np.min(self.x_np, axis=self.axis, keepdims=self.keepdim) + else: + print('This unittest only test amax/amin/max/min, but now is', + func) + self.np_out[func] = out + grad = np.zeros(self.shape) + out_b = np.broadcast_to(out.view(), self.shape) + grad[self.x_np == out_b] = 1 + if func in ['amax', 'amin']: + grad_sum = grad.sum(self.axis).reshape(out.shape) + grad_b = np.broadcast_to(grad_sum, self.shape) + grad /= grad_sum + + self.np_grad[func] = grad + + self.np_out = dict() + self.np_grad = dict() + _cal_np_out_and_gradient('amax') + _cal_np_out_and_gradient('amin') + _cal_np_out_and_gradient('max') + _cal_np_out_and_gradient('min') + + def _choose_paddle_func(self, func, x): + if func is 'amax': + out = paddle.amax(x, self.axis, self.keepdim) + elif func is 'amin': + out = paddle.amin(x, self.axis, self.keepdim) + elif func is 'max': + out = paddle.max(x, self.axis, self.keepdim) + elif func is 'min': + out = paddle.min(x, self.axis, self.keepdim) + else: + print('This unittest only test amax/amin/max/min, but now is', func) + return out + + # We check the output between paddle API and numpy in static graph. 
+ def test_static_graph(self): + def _test_static_graph(func): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x = fluid.data(name='input', dtype=self.dtype, shape=self.shape) + x.stop_gradient = False + out = self._choose_paddle_func(func, x) + + exe = fluid.Executor(self.place) + res = exe.run(fluid.default_main_program(), + feed={'input': self.x_np}, + fetch_list=[out]) + self.assertTrue((np.array(res[0]) == self.np_out[func]).all()) + + _test_static_graph('amax') + _test_static_graph('amin') + _test_static_graph('max') + _test_static_graph('min') + + # As dygraph is easy to compute gradient, we check the gradient between + # paddle API and numpy in dygraph. + def test_dygraph(self): + def _test_dygraph(func): + paddle.disable_static() + x = paddle.to_tensor( + self.x_np, dtype=self.dtype, stop_gradient=False) + out = self._choose_paddle_func(func, x) + grad_tensor = paddle.ones_like(x) + paddle.autograd.backward([out], [grad_tensor], True) + + self.assertEqual(np.allclose(self.np_out[func], out.numpy()), True) + self.assertEqual(np.allclose(self.np_grad[func], x.grad), True) + paddle.enable_static() + + _test_dygraph('amax') + _test_dygraph('amin') + _test_dygraph('max') + _test_dygraph('min') + + + # test two minimum or maximum elements +class TestMaxMinAmaxAminAPI2(TestMaxMinAmaxAminAPI): + def init_case(self): + self.x_np = np.array([[0.2, 0.3, 0.9, 0.9], [0.1, 0.1, 0.6, 0.7]]) + self.shape = [2, 4] + self.dtype = 'float64' + self.axis = None + self.keepdim = False + + +# test different axis +class TestMaxMinAmaxAminAPI3(TestMaxMinAmaxAminAPI): + def init_case(self): + self.x_np = np.array([[0.2, 0.3, 0.9, 0.9], [0.1, 0.1, 0.6, 0.7]]) + self.shape = [2, 4] + self.dtype = 'float64' + self.axis = 0 + self.keepdim = False + + +# test keepdim = True +class TestMaxMinAmaxAminAPI4(TestMaxMinAmaxAminAPI): + def init_case(self): + self.x_np = np.array([[0.2, 0.3, 0.9, 0.9], [0.1, 0.1, 0.6, 0.7]]) + self.shape = [2, 4] + self.dtype = 'float64' + self.axis = 1 + self.keepdim = True + + +# test axis is tuple +class TestMaxMinAmaxAminAPI5(TestMaxMinAmaxAminAPI): + def init_case(self): + self.x_np = np.array( + [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).astype(np.int32) + self.shape = [2, 2, 2] + self.dtype = 'int32' + self.axis = (0, 1) + self.keepdim = False + + +# test multiple minimum or maximum elements +class TestMaxMinAmaxAminAPI6(TestMaxMinAmaxAminAPI): + def init_case(self): + self.x_np = np.array([[0.2, 0.9, 0.9, 0.9], [0.9, 0.9, 0.2, 0.2]]) + self.shape = [2, 4] + self.dtype = 'float64' + self.axis = None + self.keepdim = False diff --git a/python/paddle/fluid/tests/unittests/test_max_op.py b/python/paddle/fluid/tests/unittests/test_max_op.py index caee7d9e5c2ba..5e413e80d7143 100644 --- a/python/paddle/fluid/tests/unittests/test_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_max_op.py @@ -98,6 +98,15 @@ def test_big_dimension(self): self.assertEqual((np_z1 == z_expected).all(), True) self.assertEqual((np_z2 == z_expected).all(), True) + def test_all_negative_axis(self): + paddle.disable_static() + x = paddle.rand(shape=[2, 2]) + np_x = x.numpy() + z1 = paddle.max(x, axis=(-2, -1)) + np_z1 = z1.numpy() + z_expected = np.array(np.max(np_x, axis=(0, 1))) + self.assertEqual((np_z1 == z_expected).all(), True) + class TestOutDtype(unittest.TestCase): def test_max(self): diff --git a/python/paddle/fluid/tests/unittests/test_merged_adam_op.py b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py 
new file mode 100644 index 0000000000000..f515a9f95b109 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_merged_adam_op.py @@ -0,0 +1,157 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +from paddle import _C_ops + + +def run_adam_op(params, + grads, + lrs, + moment1s, + moment2s, + beta1_pows, + beta2_pows, + master_params, + epsilon, + beta1, + beta2, + place, + multi_precision=False, + use_merged=False): + assert len(params) == len(grads) + assert len(params) == len(lrs) + assert len(params) == len(moment1s) + assert len(params) == len(moment2s) + assert len(params) == len(beta1_pows) + assert len(params) == len(beta1_pows) + assert len(params) == len(master_params) + paddle.disable_static() + paddle.set_device(place) + + param_vars = [paddle.fluid.dygraph.to_variable(p) for p in params] + grad_vars = [paddle.fluid.dygraph.to_variable(g) for g in grads] + lr_vars = [paddle.fluid.dygraph.to_variable(l) for l in lrs] + moment1_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment1s] + moment2_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment2s] + beta1_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta1_pows] + beta2_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta2_pows] + master_param_vars = [ + paddle.fluid.dygraph.to_variable(m_p) for m_p in master_params + ] + + if not use_merged: + for i in range(len(param_vars)): + _, _, _, _, _, _ = _C_ops.adam( + param_vars[i], grad_vars[i], lr_vars[i], moment1_vars[i], + moment2_vars[i], beta1_pow_vars[i], beta2_pow_vars[i], + master_param_vars[i], param_vars[i], moment1_vars[i], + moment2_vars[i], beta1_pow_vars[i], beta2_pow_vars[i], + master_param_vars[i], 'epsilon', epsilon, 'beta1', beta1, + 'beta2', beta2, 'multi_precision', multi_precision) + else: + _, _, _, _, _, _ = _C_ops.merged_adam( + param_vars, grad_vars, lr_vars, moment1_vars, moment2_vars, + beta1_pow_vars, beta2_pow_vars, master_param_vars, param_vars, + moment1_vars, moment2_vars, beta1_pow_vars, beta2_pow_vars, + master_param_vars, 'epsilon', epsilon, 'beta1', beta1, 'beta2', + beta2, 'multi_precision', multi_precision) + + outputs = { + 'ParamOut': param_vars, + 'Moment1Out': moment1_vars, + 'Moment2Out': moment2_vars, + 'Beta1PowOut': beta1_pow_vars, + 'Beta2PowOut': beta2_pow_vars, + 'MasterParamOut': master_param_vars + } + + return outputs + + +class TestMergedAdam(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float16 if multi_precision and place == 'gpu' else np.float32 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + lrs = self.gen_rand_data([[1], 
[1], [1], [1]], mp_dtype) + moment1s = self.gen_rand_data(shapes, mp_dtype) + moment2s = self.gen_rand_data(shapes, mp_dtype) + beta1_pows = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype) + beta2_pows = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype) + master_params = [p.astype(mp_dtype) for p in params] + return params, grads, lrs, moment1s, moment2s, beta1_pows, beta2_pows, master_params + + def check_with_place(self, place, multi_precision): + params, grads, lrs, moment1s, moment2s, beta1_pows, beta2_pows, master_params = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_merged): + return run_adam_op( + params=params, + grads=grads, + lrs=lrs, + moment1s=moment1s, + moment2s=moment2s, + beta1_pows=beta1_pows, + beta2_pows=beta2_pows, + master_params=master_params, + epsilon=0.9, + beta1=0.9, + beta2=0.99, + place=place, + multi_precision=multi_precision, + use_merged=use_merged) + + outs1 = run_op(True) + outs2 = run_op(False) + self.assertEqual(len(outs1), len(outs2)) + + for key in outs1.keys(): + value1 = outs1[key] + value2 = outs2[key] + for i in range(len(value1)): + if place == 'gpu': + self.assertTrue(np.array_equal(value1[i], value2[i])) + else: + self.assertTrue( + np.allclose( + value1[i], value2[i], atol=1e-7)) + + def get_places(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def test_main(self): + for multi_precision in [False, True]: + for place in self.get_places(): + self.check_with_place(place, multi_precision) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mish_op.py b/python/paddle/fluid/tests/unittests/test_mish_op.py deleted file mode 100644 index 8cc785e450f0b..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_mish_op.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import numpy as np -import six -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid import Program, program_guard -from op_test import OpTest, skip_check_grad_ci - - -class TestMishOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program()): - # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.mish, 0.1, 20) - # The input dtype must be float16, float32, float64. 
- x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.mish, x_int32, 20) - # support the input dtype is float32 - x_fp16 = fluid.layers.data( - name='x_fp16', shape=[12, 10], dtype='float32') - fluid.layers.mish(x_fp16, threshold=20) - - -class MishTest(OpTest): - def setUp(self): - self.init_dtype() - self.init_input_shape() - self.init_input_range() - self.init_threshold() - self.op_type = "mish" - - x_np = np.random.uniform(self.x_range[0], self.x_range[1], - self.x_shape).astype(self.dtype) - self.inputs = {'X': x_np} - - softplus = x_np * (x_np > self.threshold) + np.exp(x_np) * \ - (x_np < -self.threshold) + np.log(np.exp(x_np) + 1.) * \ - (x_np >= -self.threshold) * (x_np <= self.threshold) - out_np = x_np * np.tanh(softplus) - - self.outputs = {'Out': out_np} - self.attrs = {'threshold': self.threshold} - - def init_dtype(self): - self.dtype = 'float32' - - def init_input_shape(self): - self.x_shape = (10, 12) - - def init_input_range(self): - self.x_range = [-1, 1] - - def init_threshold(self): - self.threshold = 5. - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class MishTestUpperThresh(MishTest): - def init_input_range(self): - self.x_range = [6, 7] - - -class MishTestLowerThresh(MishTest): - def init_input_range(self): - self.x_range = [-7, -6] - - -# mish op contain calculation like: tanh, exp, log, while tanh -# may have diff on CPUPlace(see test_activation_op.py::TestTanh), -# especially when abs(x) is a large value, only check input value -# in range [-1, 1] for float64 here. -class MishTestFP64(MishTest): - def init_dtype(self): - self.dtype = 'float64' - - def init_input_range(self): - self.x_range = [-1, 1] - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_mode_op.py b/python/paddle/fluid/tests/unittests/test_mode_op.py new file mode 100644 index 0000000000000..1b0458f2e255f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_mode_op.py @@ -0,0 +1,178 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
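The mish reference in the test removed above evaluates softplus piecewise around the op's `threshold` attribute: softplus(x) is taken as x for x > threshold, as exp(x) for x < -threshold, and as log(1 + exp(x)) only in between, then out = x * tanh(softplus(x)). A minimal NumPy sketch of that formula (function names and sample values are illustrative, not part of the patch):

    import numpy as np

    def softplus_ref(x, threshold=20.0):
        # Fall back to the asymptotes for |x| > threshold, mirroring the op attribute.
        return np.where(x > threshold, x,
                        np.where(x < -threshold, np.exp(x), np.log1p(np.exp(x))))

    def mish_ref(x, threshold=20.0):
        return x * np.tanh(softplus_ref(x, threshold))

    x = np.array([-30.0, -1.0, 0.0, 1.0, 30.0])
    print(mish_ref(x))  # large negative inputs go to ~0, large positive inputs approach x
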
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid + + +def _mode1D(a): + sorted_inds = np.argsort(a, kind='stable') + sorted_array = a[sorted_inds] + max_freq = 0 + cur_freq = 0 + mode = -1 + for i in range(len(sorted_array)): + cur_freq += 1 + if i == len(sorted_array) - 1 or sorted_array[i] != sorted_array[i + 1]: + if cur_freq > max_freq: + mode = sorted_array[i] + index = sorted_inds[i] + max_freq = cur_freq + cur_freq = 0 + return mode, index + + +def cal_mode(a, axis, keepdim=False): + if axis < 0: + axis = len(a.shape) + axis + in_dims = list(range(a.ndim)) + a_view = np.transpose(a, in_dims[:axis] + in_dims[axis + 1:] + [axis]) + inds = np.ndindex(a_view.shape[:-1]) + modes = np.empty(a_view.shape[:-1], dtype=a.dtype) + indexes = np.empty(a_view.shape[:-1], dtype=np.int64) + for ind in inds: + modes[ind], indexes[ind] = _mode1D(a_view[ind]) + if keepdim: + newshape = list(a.shape) + newshape[axis] = 1 + modes = modes.reshape(newshape) + indexes = indexes.reshape(newshape) + return modes, indexes + + +class TestModeOp(OpTest): + def init_args(self): + self.axis = 1 + + def setUp(self): + self.op_type = "mode" + self.dtype = np.float64 + np.random.seed(666) + self.input_data = np.random.rand(2, 64, 1) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'axis': self.axis} + output, indices = cal_mode(self.input_data, axis=self.axis) + self.outputs = {'Out': output, 'Indices': indices} + + def test_check_output(self): + paddle.enable_static() + self.check_output() + + def test_check_grad(self): + paddle.enable_static() + self.check_grad(set(['X']), 'Out') + + +class TestModeOpLastdim(OpTest): + def init_args(self): + self.axis = -1 + + def setUp(self): + self.op_type = "mode" + self.dtype = np.float64 + np.random.seed(666) + self.input_data = np.random.rand(2, 1, 1, 2, 30) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'axis': self.axis} + output, indices = cal_mode(self.input_data, axis=self.axis) + self.outputs = {'Out': output, 'Indices': indices} + + def test_check_output(self): + paddle.enable_static() + self.check_output() + + def test_check_grad(self): + paddle.enable_static() + self.check_grad(set(['X']), 'Out') + + +class TestModeOpKernels(unittest.TestCase): + def setUp(self): + self.axises = [-1, 1] + np.random.seed(666) + self.inputs = np.ceil(np.random.rand(2, 10, 10) * 1000) + + def test_mode_op(self): + def test_cpu_kernel(): + paddle.set_device('cpu') + tensor = paddle.to_tensor(self.inputs) + for axis in self.axises: + value_expect, indice_expect = cal_mode(self.inputs, axis) + v, inds = paddle.mode(tensor, axis) + self.assertTrue(np.allclose(v.numpy(), value_expect)) + + value_expect, indice_expect = cal_mode( + self.inputs, axis, keepdim=True) + v, inds = paddle.mode(tensor, axis, keepdim=True) + self.assertTrue(np.allclose(v.numpy(), value_expect)) + + def test_gpu_kernel(): + paddle.set_device('gpu') + tensor = paddle.to_tensor(self.inputs) + for axis in self.axises: + value_expect, indice_expect = cal_mode(self.inputs, axis) + v, inds = paddle.mode(tensor, axis) + self.assertTrue(np.allclose(v.numpy(), value_expect)) + + value_expect, indice_expect = cal_mode( + self.inputs, axis, keepdim=True) + v, inds = paddle.mode(tensor, axis, keepdim=True) + self.assertTrue(np.allclose(v.numpy(), value_expect)) + + paddle.disable_static() + test_cpu_kernel() + if fluid.core.is_compiled_with_cuda(): + test_gpu_kernel() + + 
+class TestModeOpErrors(unittest.TestCase): + def setUp(self): + self.x = paddle.uniform([2, 10, 20, 25], dtype='float32') + + def test_dim_range_error(): + self.x.mode(axis=5) + + self.assertRaises(ValueError, test_dim_range_error) + + +class TestModeOpInStatic(unittest.TestCase): + def setUp(self): + np.random.seed(666) + self.input_data = np.ceil( + np.random.random((2, 10, 10)) * 1000, dtype=np.float64) + + def test_run_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_tensor = paddle.static.data( + name="x", shape=[2, 10, 10], dtype="float64") + + result = paddle.mode(input_tensor, axis=1) + expect_value = cal_mode(self.input_data, axis=1)[0] + exe = paddle.static.Executor(paddle.CPUPlace()) + paddle_result = exe.run(feed={"x": self.input_data}, + fetch_list=[result])[0] + self.assertTrue(np.allclose(paddle_result, expect_value)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nansum_api.py b/python/paddle/fluid/tests/unittests/test_nansum_api.py new file mode 100644 index 0000000000000..a9fc285d2d9d0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nansum_api.py @@ -0,0 +1,101 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard + + +class API_Test_Nansum(unittest.TestCase): + def test_static_graph(self): + paddle.enable_static() + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + input = fluid.data(name='input', dtype='float32', shape=[2, 4]) + out1 = paddle.nansum(input) + out2 = paddle.nansum(input, axis=0) + out3 = paddle.nansum(input, axis=-1) + out4 = paddle.nansum(input, axis=1, keepdim=True) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + + x = np.array([[float('nan'), 3, 5, 9], + [1, 2, float('-nan'), 7]]).astype(np.float32) + res = exe.run(train_program, + feed={'input': x}, + fetch_list=[out1, out2, out3, out4]) + + out1_np = np.array(res[0]) + out2_np = np.array(res[1]) + out3_np = np.array(res[2]) + out4_np = np.array(res[3]) + out1_ref = np.array([27]).astype(np.float32) + out2_ref = np.array([1, 5, 5, 16]).astype(np.float32) + out3_ref = np.array([17, 10]).astype(np.float32) + out4_ref = np.array([[17], [10]]).astype(np.float32) + + self.assertTrue( + (out1_np == out1_ref).all(), + msg='nansum output is wrong, out =' + str(out1_np)) + self.assertTrue( + (out2_np == out2_ref).all(), + msg='nansum output is wrong, out =' + str(out2_np)) + self.assertTrue( + (out3_np == out3_ref).all(), + msg='nansum output is wrong, out =' + str(out3_np)) + self.assertTrue( + (out4_np == out4_ref).all(), + msg='nansum output is wrong, out =' + str(out4_np)) + + def test_error_api(self): + paddle.enable_static() + + ## input dtype error + def run1(): + input = fluid.data(name='input', dtype='float16', shape=[2, 3]) + output = paddle.nansum(input) + + self.assertRaises(TypeError, run1) + + ## axis type error + def run2(): + input = fluid.data(name='input', dtype='float16', shape=[2, 3]) + output = paddle.nansum(input, axis=1.2) + + self.assertRaises(TypeError, run2) + + def test_dygraph(self): + x = np.array([[float('nan'), 3, 5, 9], + [1, 2, float('-nan'), 7]]).astype(np.float32) + with fluid.dygraph.guard(): + inputs = fluid.dygraph.to_variable(x) + out = paddle.nansum(inputs) + out_ref = np.array([27]).astype(np.float32) + + self.assertTrue( + (out.numpy() == out_ref).all(), + msg='nansum output is wrong, out =' + str(out.numpy())) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_op_function_generator.py b/python/paddle/fluid/tests/unittests/test_op_function_generator.py index c59eca90d7536..e3e7451c423b5 100644 --- a/python/paddle/fluid/tests/unittests/test_op_function_generator.py +++ b/python/paddle/fluid/tests/unittests/test_op_function_generator.py @@ -21,6 +21,7 @@ import paddle.fluid.core as core from paddle.fluid.dygraph.jit import TracedLayer import numpy as np +from paddle import _C_ops class TestTracedLayer(fluid.dygraph.Layer): @@ -28,7 +29,7 @@ def __init__(self, name_scope): super(TestTracedLayer, self).__init__(name_scope) def forward(self, input): - return core.ops.relu(input) + return _C_ops.relu(input) class TestVariable(unittest.TestCase): @@ -46,7 +47,7 @@ def test_elementwise_add(self): x.stop_gradient = False res1 = layers.elementwise_add(x, y) - res2 = core.ops.elementwise_add(x, y) + res2 = _C_ops.elementwise_add(x, y) 
self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) @@ -58,7 +59,7 @@ def test_elementwise_mul(self): y = fluid.dygraph.to_variable(b) res1 = layers.elementwise_mul(x, y) - res2 = core.ops.elementwise_mul(x, y) + res2 = _C_ops.elementwise_mul(x, y) self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) @@ -68,7 +69,7 @@ def test_relu(self): x = fluid.dygraph.to_variable(a) res1 = layers.relu(x) - res2 = core.ops.relu(x) + res2 = _C_ops.relu(x) self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) @@ -81,7 +82,7 @@ def test_trace_backward(self): x.stop_gradient = False y.stop_gradient = False - loss = core.ops.elementwise_mul(x, y) + loss = _C_ops.elementwise_mul(x, y) loss.backward() x_grad = x.gradient() diff --git a/python/paddle/fluid/tests/unittests/test_outer.py b/python/paddle/fluid/tests/unittests/test_outer.py new file mode 100644 index 0000000000000..1b11a71bb2f09 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_outer.py @@ -0,0 +1,153 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +import numpy as np + +import paddle +from paddle.static import Program, program_guard + + +class TestMultiplyApi(unittest.TestCase): + def _run_static_graph_case(self, x_data, y_data): + with program_guard(Program(), Program()): + paddle.enable_static() + x = paddle.static.data( + name='x', shape=x_data.shape, dtype=x_data.dtype) + y = paddle.static.data( + name='y', shape=y_data.shape, dtype=y_data.dtype) + res = paddle.outer(x, y) + + place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda( + ) else paddle.CPUPlace() + exe = paddle.static.Executor(place) + outs = exe.run(paddle.static.default_main_program(), + feed={'x': x_data, + 'y': y_data}, + fetch_list=[res]) + res = outs[0] + return res + + def _run_dynamic_graph_case(self, x_data, y_data): + paddle.disable_static() + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + res = paddle.outer(x, y) + return res.numpy() + + def test_multiply(self): + np.random.seed(7) + + # test static computation graph: 3-d array + x_data = np.random.rand(2, 10, 10).astype(np.float64) + y_data = np.random.rand(2, 5, 10).astype(np.float64) + res = self._run_static_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.outer(x_data, y_data))) + + # test static computation graph: 2-d array + x_data = np.random.rand(200, 5).astype(np.float64) + y_data = np.random.rand(50, 5).astype(np.float64) + res = self._run_static_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.outer(x_data, y_data))) + + # test static computation graph: 1-d array + x_data = np.random.rand(50).astype(np.float64) + y_data = np.random.rand(50).astype(np.float64) + res = self._run_static_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.outer(x_data, y_data))) + + # test dynamic computation graph: 3-d array + x_data = np.random.rand(5, 10, 10).astype(np.float64) + y_data = np.random.rand(2, 
10).astype(np.float64) + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.outer(x_data, y_data))) + + # test dynamic computation graph: 2-d array + x_data = np.random.rand(20, 50).astype(np.float64) + y_data = np.random.rand(50).astype(np.float64) + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.outer(x_data, y_data))) + + # test dynamic computation graph: Scalar + x_data = np.random.rand(20, 10).astype(np.float32) + y_data = np.random.rand(1).astype(np.float32).item() + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.outer(x_data, y_data), rtol=1e4)) + + # test dynamic computation graph: 2-d array Complex + x_data = np.random.rand(20, + 50).astype(np.float64) + 1J * np.random.rand( + 20, 50).astype(np.float64) + y_data = np.random.rand(50).astype(np.float64) + 1J * np.random.rand( + 50).astype(np.float64) + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.outer(x_data, y_data))) + + # test dynamic computation graph: 3-d array Complex + x_data = np.random.rand(5, 10, + 10).astype(np.float64) + 1J * np.random.rand( + 5, 10, 10).astype(np.float64) + y_data = np.random.rand(2, 10).astype(np.float64) + 1J * np.random.rand( + 2, 10).astype(np.float64) + res = self._run_dynamic_graph_case(x_data, y_data) + self.assertTrue(np.allclose(res, np.outer(x_data, y_data))) + + +class TestMultiplyError(unittest.TestCase): + def test_errors(self): + # test static computation graph: dtype can not be int8 + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[100], dtype=np.int8) + y = paddle.static.data(name='y', shape=[100], dtype=np.int8) + self.assertRaises(TypeError, paddle.outer, x, y) + + np.random.seed(7) + # test dynamic computation graph: dtype can not be int8 + paddle.disable_static() + x_data = np.random.randn(200).astype(np.int8) + y_data = np.random.randn(200).astype(np.int8) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + self.assertRaises(RuntimeError, paddle.outer, x, y) + + # test dynamic computation graph: dtype must be same + x_data = np.random.randn(200).astype(np.float32) + y_data = np.random.randn(200).astype(np.float64) + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + self.assertRaises(ValueError, paddle.outer, x, y) + + # test dynamic computation graph: dtype must be Tensor type + x_data = np.random.randn(200).astype(np.float64) + y_data = np.random.randn(200).astype(np.float64) + y = paddle.to_tensor(y_data) + self.assertRaises(ValueError, paddle.outer, x_data, y) + + # test dynamic computation graph: dtype must be Tensor type + x_data = np.random.randn(200).astype(np.float32) + y_data = np.random.randn(200).astype(np.float32) + x = paddle.to_tensor(x_data) + self.assertRaises(ValueError, paddle.outer, x, y_data) + + # test dynamic computation graph: dtype must be Tensor type + x_data = np.random.randn(200).astype(np.float32) + y_data = np.random.randn(200).astype(np.float32) + self.assertRaises(ValueError, paddle.outer, x_data, y_data) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parameter.py b/python/paddle/fluid/tests/unittests/test_parameter.py index 46e211f4729f0..85ba69cd438a7 100644 --- a/python/paddle/fluid/tests/unittests/test_parameter.py +++ b/python/paddle/fluid/tests/unittests/test_parameter.py @@ -18,18 +18,19 @@ import copy import paddle from paddle.fluid.dygraph 
import guard -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, Variable import paddle.fluid.core as core from paddle.fluid.executor import Executor import paddle.fluid.io as io from paddle.fluid.initializer import ConstantInitializer import numpy as np +paddle.enable_static() main_program = default_main_program() class ParameterChecks(unittest.TestCase): - def check_parameter(self): + def test_parameter(self): shape = [784, 100] val = 1.0625 b = main_program.global_block() @@ -43,13 +44,13 @@ def check_parameter(self): self.assertEqual((784, 100), param.shape) self.assertEqual(core.VarDesc.VarType.FP32, param.dtype) self.assertEqual(0, param.block.idx) - exe = Executor(core.CPUPlace()) + exe = Executor(paddle.CPUPlace()) p = exe.run(main_program, fetch_list=[param])[0] - self.assertTrue(np.allclose(p, np.ones(shape) * val)) + self.assertTrue(np.array_equal(p, np.ones(shape) * val)) p = io.get_parameter_value_by_name('fc.w', exe, main_program) - self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val)) + self.assertTrue(np.array_equal(p, np.ones(shape) * val)) - def check_parambase(self): + def test_parambase(self): with guard(): linear = paddle.nn.Linear(10, 10) param = linear.weight @@ -71,7 +72,7 @@ def check_parambase(self): pram_copy2 = copy.deepcopy(param, memo) self.assertEqual(id(param_copy), id(pram_copy2)) - def check_exceptions(self): + def test_exception(self): b = main_program.global_block() with self.assertRaises(ValueError): b.create_parameter( @@ -86,16 +87,30 @@ def check_exceptions(self): b.create_parameter( name='test', shape=[-1], dtype='float32', initializer=None) + def test_parambase_to_vector(self): + with guard(): + initializer = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(3.)) + linear1 = paddle.nn.Linear(10, 15, initializer) -class TestParameter(ParameterChecks): - def _test_parameter(self): - self.check_parameter() - - def test_parambase(self): - self.check_parambase() + vec = paddle.nn.utils.parameters_to_vector(linear1.parameters()) + self.assertEqual(linear1.weight.shape, [10, 15]) + self.assertEqual(linear1.bias.shape, [15]) + self.assertTrue(isinstance(vec, Variable)) + self.assertTrue(vec.shape, [165]) - def test_exceptions(self): - self.check_exceptions() + linear2 = paddle.nn.Linear(10, 15) + paddle.nn.utils.vector_to_parameters(vec, linear2.parameters()) + self.assertEqual(linear2.weight.shape, [10, 15]) + self.assertEqual(linear2.bias.shape, [15]) + self.assertTrue( + np.array_equal(linear1.weight.numpy(), linear2.weight.numpy()), + True) + self.assertTrue( + np.array_equal(linear1.bias.numpy(), linear2.bias.numpy()), + True) + self.assertTrue(linear2.weight.is_leaf, True) + self.assertTrue(linear2.bias.is_leaf, True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py new file mode 100644 index 0000000000000..dc4dc3284e923 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +from op_test import OpTest +import math + +paddle.enable_static() +paddle.seed(100) + + +def output_hist(out, lam, a, b): + prob = [] + bin = [] + for i in range(a, b + 1): + prob.append((lam**i) * math.exp(-lam) / math.factorial(i)) + bin.append(i) + bin.append(b + 0.1) + + hist, _ = np.histogram(out, bin) + hist = hist.astype("float32") + hist = hist / float(out.size) + return hist, prob + + +class TestPoissonOp1(OpTest): + def setUp(self): + self.op_type = "poisson" + self.config() + + self.attrs = {} + self.inputs = {'X': np.full([1024, 1024], self.lam, dtype=self.dtype)} + self.outputs = {'Out': np.ones([1024, 1024], dtype=self.dtype)} + + def config(self): + self.lam = 10 + self.a = 5 + self.b = 15 + self.dtype = "float64" + + def verify_output(self, outs): + hist, prob = output_hist(np.array(outs[0]), self.lam, self.a, self.b) + self.assertTrue( + np.allclose( + hist, prob, rtol=0.01), + "actual: {}, expected: {}".format(hist, prob)) + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def test_check_grad_normal(self): + self.check_grad( + ['X'], + 'Out', + user_defined_grads=[np.zeros( + [1024, 1024], dtype=self.dtype)], + user_defined_grad_outputs=[ + np.random.rand(1024, 1024).astype(self.dtype) + ]) + + +class TestPoissonOp2(TestPoissonOp1): + def config(self): + self.lam = 5 + self.a = 1 + self.b = 9 + self.dtype = "float32" + + +class TestPoissonAPI(unittest.TestCase): + def test_static(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x_np = np.random.rand(10, 10) + x = paddle.static.data(name="x", shape=[10, 10], dtype='float64') + y = paddle.poisson(x) + + exe = paddle.static.Executor() + y_np = exe.run(paddle.static.default_main_program(), + feed={"x": x_np}, + fetch_list=[y]) + self.assertTrue(np.min(y_np) >= 0) + + def test_dygraph(self): + paddle.disable_static() + x = paddle.randn([10, 10], dtype='float32') + y = paddle.poisson(x) + self.assertTrue(np.min(y.numpy()) >= 0) + paddle.enable_static() + + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + def test_fixed_random_number(self): + if not paddle.is_compiled_with_cuda(): + return + + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + x = paddle.full([32, 3, 1024, 768], 10., dtype="float32") + y = paddle.poisson(x) + y_np = y.numpy() + + expect = [ + 13., 13., 11., 8., 12., 6., 9., 15., 16., 6., 13., 12., 9., 15., + 17., 8., 11., 16., 11., 10. + ] + self.assertTrue(np.array_equal(y_np[0, 0, 0, 0:20], expect)) + + expect = [ + 15., 7., 12., 8., 14., 10., 10., 11., 11., 11., 21., 6., 9., 13., + 13., 11., 6., 9., 12., 12. + ] + self.assertTrue(np.array_equal(y_np[8, 1, 300, 200:220], expect)) + + expect = [ + 10., 15., 9., 6., 4., 13., 10., 10., 13., 12., 9., 7., 10., 14., 7., + 10., 8., 5., 10., 14. + ] + self.assertTrue(np.array_equal(y_np[16, 1, 600, 400:420], expect)) + + expect = [ + 10., 9., 14., 12., 8., 9., 7., 8., 11., 10., 13., 8., 12., 9., 7., + 8., 11., 11., 12., 5. 
+ ] + self.assertTrue(np.array_equal(y_np[24, 2, 900, 600:620], expect)) + + expect = [ + 15., 5., 11., 13., 12., 12., 13., 16., 9., 9., 7., 9., 13., 11., + 15., 6., 11., 9., 10., 10. + ] + self.assertTrue(np.array_equal(y_np[31, 2, 1023, 748:768], expect)) + + x = paddle.full([16, 1024, 1024], 5., dtype="float32") + y = paddle.poisson(x) + y_np = y.numpy() + expect = [ + 4., 5., 2., 9., 8., 7., 4., 7., 4., 7., 6., 3., 10., 7., 5., 7., 2., + 5., 5., 6. + ] + self.assertTrue(np.array_equal(y_np[0, 0, 100:120], expect)) + + expect = [ + 1., 4., 8., 11., 6., 5., 4., 4., 7., 4., 4., 7., 11., 6., 5., 3., + 4., 6., 3., 3. + ] + self.assertTrue(np.array_equal(y_np[4, 300, 300:320], expect)) + + expect = [ + 7., 5., 4., 6., 8., 5., 6., 7., 7., 7., 3., 10., 5., 10., 4., 5., + 8., 7., 5., 7. + ] + self.assertTrue(np.array_equal(y_np[8, 600, 600:620], expect)) + + expect = [ + 8., 6., 7., 4., 3., 0., 4., 6., 6., 4., 3., 10., 5., 1., 3., 8., 8., + 2., 1., 4. + ] + self.assertTrue(np.array_equal(y_np[12, 900, 900:920], expect)) + + expect = [ + 2., 1., 14., 3., 6., 5., 2., 2., 6., 5., 7., 4., 8., 4., 8., 4., 5., + 7., 1., 7. + ] + self.assertTrue(np.array_equal(y_np[15, 1023, 1000:1020], expect)) + paddle.enable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py b/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py new file mode 100644 index 0000000000000..07354f1b7b25b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py @@ -0,0 +1,58 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
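TestPoissonOp above validates sampling statistically rather than elementwise: output_hist bins a 1024x1024 sample over a window [a, b] around lam, normalizes the counts by the sample size, and checks each bin against the analytic PMF lam^k * exp(-lam) / k!. A self-contained sketch of that check, using NumPy's Poisson sampler as a stand-in for paddle.poisson (window, seed, and tolerance are illustrative):

    import math
    import numpy as np

    def poisson_pmf(lam, a, b):
        # Analytic P(X = k) for k in [a, b].
        return [lam**k * math.exp(-lam) / math.factorial(k) for k in range(a, b + 1)]

    lam, a, b = 10, 5, 15
    np.random.seed(100)
    samples = np.random.poisson(lam, size=1024 * 1024)

    bins = list(range(a, b + 1)) + [b + 0.1]   # last bin only catches k == b
    hist, _ = np.histogram(samples, bins)
    hist = hist.astype("float64") / samples.size
    assert np.allclose(hist, poisson_pmf(lam, a, b), rtol=0.05)
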
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from paddle.fluid.layers.nn import _pull_gpups_sparse + +paddle.enable_static() + + +class TestPullGpupsSparse(unittest.TestCase): + """Test PullGpupsSparse op.""" + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + slots = [] + with fluid.program_guard(train_program, startup_program): + + l = fluid.layers.data( + name='input', shape=[1], dtype="int64", lod_level=1) + slots.append(l) + output = _pull_gpups_sparse( + slots, size=[11], is_distributed=True, is_sparse=True) + cost = paddle.fluid.layers.mean(output) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(cost, train_program) + block = train_program.global_block() + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + img = np.array([1]).astype(np.int64) + res = exe.run(train_program, + feed={'input': img}, + fetch_list=[output]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py new file mode 100644 index 0000000000000..7a7c2987f3b51 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import copy +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.framework import core +from paddle.fluid.dygraph.base import switch_to_static_graph + +paddle.enable_static() + + +class TestPutAlongAxisOp(OpTest): + def setUp(self): + self.init_data() + self.reduce_op = "assign" + self.dtype = 'float64' + self.op_type = "put_along_axis" + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace opearion. 
+ self.xnp_result = copy.deepcopy(self.xnp) + np.put_along_axis(self.xnp_result, self.index, self.value, self.axis) + self.target = self.xnp_result + broadcast_shape_list = list(self.x_shape) + broadcast_shape_list[self.axis] = 1 + self.braodcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.braodcast_shape) + self.value_broadcast = np.broadcast_to(self.value, self.braodcast_shape) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index_broadcast, + 'Value': self.value_broadcast + } + self.attrs = {'Axis': self.axis, 'Reduce': self.reduce_op} + self.outputs = {'Result': self.target} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["Input", "Value"], "Result") + + def init_data(self): + self.x_type = "float64" + self.x_shape = (10, 10, 10) + self.value_type = "float64" + self.value = np.array([99]).astype(self.value_type) + self.index_type = "int32" + self.index = np.array([[[0]]]).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +class TestPutAlongAxisAPI(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.shape = [1, 3] + self.index_shape = [1, 1] + self.index_np = np.array([[0]]).astype('int64') + self.x_np = np.random.random(self.shape).astype(np.float32) + self.place = [paddle.CPUPlace()] + self.axis = 0 + self.value_np = 99.0 + self.value_shape = [1] + self.x_feed = copy.deepcopy(self.x_np) + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def test_api_static_case1(self): + paddle.enable_static() + + def run(place): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', self.shape) + index = paddle.fluid.data('Index', self.index_shape, "int64") + value = paddle.fluid.data('Value', self.value_shape) + out = paddle.put_along_axis(x, index, value, self.axis) + exe = paddle.static.Executor(self.place[0]) + res = exe.run(feed={ + 'X': self.x_feed, + 'Value': self.value_np, + 'Index': self.index_np + }, + fetch_list=[out]) + + np.put_along_axis(self.x_np, self.index_np, self.value_np, + self.axis) + # numpy put_along_axis is an inplace opearion. 
+ out_ref = self.x_np + + for out in res: + self.assertEqual(np.allclose(out, out_ref, rtol=1e-03), True) + + for place in self.place: + run(place) + + def test_api_dygraph_case1(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.x_np) + index_tensor = paddle.to_tensor(self.index_np) + value_tensor = paddle.to_tensor(self.value_np) + out = paddle.put_along_axis(x_tensor, index_tensor, value_tensor, + self.axis) + np.array( + np.put_along_axis(self.x_np, self.index_np, self.value_np, + self.axis)) + out_ref = self.x_np + self.assertEqual( + np.allclose( + out.numpy(), out_ref, rtol=1e-03), True) + + # for ci coverage, numpy put_along_axis did not support argument of 'reduce' + paddle.put_along_axis(x_tensor, index_tensor, value_tensor, + self.axis, 'mul') + paddle.put_along_axis(x_tensor, index_tensor, value_tensor, + self.axis, 'add') + + paddle.enable_static() + + for place in self.place: + run(place) + + def test_api_dygraph_case2(self): + def run(place): + paddle.disable_static(place) + self.shape = [2, 2] + self.index_shape = [2, 2] + self.index_np = np.array([[0, 0], [1, 0]]).astype('int64') + self.x_np = np.random.random(self.shape).astype(np.float32) + + x_tensor = paddle.to_tensor(self.x_np) + index_tensor = paddle.to_tensor(self.index_np) + value_tensor = paddle.to_tensor(self.value_np) + out = paddle.put_along_axis(x_tensor, index_tensor, value_tensor, + self.axis) + np.array( + np.put_along_axis(self.x_np, self.index_np, self.value_np, + self.axis)) + out_ref = self.x_np + self.assertEqual( + np.allclose( + out.numpy(), out_ref, rtol=1e-03), True) + + paddle.enable_static() + + for place in self.place: + run(place) + + def test_inplace_dygraph_case3(self): + def run(place): + paddle.disable_static(place) + x_tensor = paddle.to_tensor(self.x_np) + index_tensor = paddle.to_tensor(self.index_np) + value_tensor = paddle.to_tensor(self.value_np) + + x_tensor.put_along_axis_(index_tensor, value_tensor, self.axis) + + np.array( + np.put_along_axis(self.x_np, self.index_np, self.value_np, + self.axis)) + out_ref = self.x_np + + self.assertEqual( + np.allclose( + x_tensor.numpy(), out_ref, rtol=1e-03), True) + paddle.enable_static() + + for place in self.place: + run(place) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_quantile.py b/python/paddle/fluid/tests/unittests/test_quantile.py new file mode 100644 index 0000000000000..0fd3c1de9ca82 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_quantile.py @@ -0,0 +1,150 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
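np.put_along_axis writes into its first argument in place, which is why the tests above deep-copy the input before building the reference and then compare paddle.put_along_axis (and its inplace variant put_along_axis_) against that copy. A compact dygraph sketch of the same pattern (the 2x2 data here is illustrative):

    import copy
    import numpy as np
    import paddle

    paddle.disable_static()
    x_np = np.array([[10., 20.], [30., 40.]], dtype=np.float32)
    index_np = np.array([[0, 0], [1, 0]], dtype=np.int64)

    # Reference is built on a copy because np.put_along_axis works in place.
    ref = copy.deepcopy(x_np)
    np.put_along_axis(ref, index_np, 99.0, axis=0)

    out = paddle.put_along_axis(
        paddle.to_tensor(x_np), paddle.to_tensor(index_np), paddle.to_tensor(99.0), 0)
    assert np.allclose(out.numpy(), ref)  # ref == [[99., 99.], [99., 40.]]
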
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle + + +class TestQuantile(unittest.TestCase): + def setUp(self): + np.random.seed(678) + self.input_data = np.random.rand(6, 7, 8, 9, 10) + + def test_quantile_single_q(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=0.5, axis=2) + np_res = np.quantile(self.input_data, q=0.5, axis=2) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_with_no_axis(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=0.35) + np_res = np.quantile(self.input_data, q=0.35) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_with_multi_axis(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=0.75, axis=[0, 2, 3]) + np_res = np.quantile(self.input_data, q=0.75, axis=[0, 2, 3]) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_with_keepdim(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=0.35, axis=4, keepdim=True) + np_res = np.quantile(self.input_data, q=0.35, axis=4, keepdims=True) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_with_keepdim_and_multiple_axis(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=0.1, axis=[1, 4], keepdim=True) + np_res = np.quantile(self.input_data, q=0.1, axis=[1, 4], keepdims=True) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_with_boundary_q(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=0, axis=3) + np_res = np.quantile(self.input_data, q=0, axis=3) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_include_NaN(self): + input_data = np.random.randn(2, 3, 4) + input_data[0, 1, 1] = np.nan + x = paddle.to_tensor(input_data) + paddle_res = paddle.quantile(x, q=0.35, axis=0) + self.assertTrue(paddle.isnan(paddle_res[1, 1])) + + +class TestQuantileMuitlpleQ(unittest.TestCase): + def setUp(self): + np.random.seed(678) + self.input_data = np.random.rand(10, 3, 4, 5, 4) + + def test_quantile(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=[0.3, 0.44], axis=-2) + np_res = np.quantile(self.input_data, q=[0.3, 0.44], axis=-2) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_multiple_axis(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile(x, q=[0.2, 0.67], axis=[1, -1]) + np_res = np.quantile(self.input_data, q=[0.2, 0.67], axis=[1, -1]) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + def test_quantile_multiple_axis_keepdim(self): + x = paddle.to_tensor(self.input_data) + paddle_res = paddle.quantile( + x, q=[0.1, 0.2, 0.3], axis=[1, 2], keepdim=True) + np_res = np.quantile( + self.input_data, q=[0.1, 0.2, 0.3], axis=[1, 2], keepdims=True) + self.assertTrue(np.allclose(paddle_res.numpy(), np_res)) + + +class TestQuantileError(unittest.TestCase): + def setUp(self): + self.x = paddle.randn((2, 3, 4)) + + def test_errors(self): + def test_q_range_error_1(): + paddle_res = paddle.quantile(self.x, q=1.5) + + self.assertRaises(ValueError, test_q_range_error_1) + + def test_q_range_error_2(): + paddle_res = paddle.quantile(self.x, q=[0.2, -0.3]) + + self.assertRaises(ValueError, test_q_range_error_2) + + def test_q_range_error_3(): + paddle_res = paddle.quantile(self.x, q=[]) + + self.assertRaises(ValueError, 
test_q_range_error_3) + + def test_x_type_error(): + x = [1, 3, 4] + paddle_res = paddle.quantile(x, q=0.9) + + self.assertRaises(TypeError, test_x_type_error) + + def test_axis_type_error_1(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=0.4) + + self.assertRaises(ValueError, test_axis_type_error_1) + + def test_axis_type_error_2(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=[1, 0.4]) + + self.assertRaises(ValueError, test_axis_type_error_2) + + def test_axis_value_error_1(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=10) + + self.assertRaises(ValueError, test_axis_value_error_1) + + def test_axis_value_error_2(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=[1, -10]) + + self.assertRaises(ValueError, test_axis_value_error_2) + + def test_axis_value_error_3(): + paddle_res = paddle.quantile(self.x, q=0.4, axis=[]) + + self.assertRaises(ValueError, test_axis_value_error_3) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_renorm_op.py b/python/paddle/fluid/tests/unittests/test_renorm_op.py new file mode 100644 index 0000000000000..3ea2002a9786f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_renorm_op.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + +paddle.set_device('cpu') + + +class TestRenormAPI(unittest.TestCase): + def input_data(self): + self.data_x = np.array( + [[[2.0, 2, -2], [3, 0.3, 3]], [[2, -8, 2], [3.1, 3.7, 3]]]) + self.p = 1.0 + self.dim = 2 + self.max_norm = 2.05 + + def test_renorm_api(self): + paddle.enable_static() + self.input_data() + + # case 1: + with program_guard(Program(), Program()): + #x = fluid.layers.data(name = 'x',shape=[-1, 2, 3]) + x = paddle.static.data(name="x", shape=[-1, 2, 3], dtype='float64') + z = paddle.renorm(x, self.p, self.dim, self.max_norm) + exe = fluid.Executor(fluid.CPUPlace()) + res, = exe.run(feed={"x": self.data_x}, + fetch_list=[z], + return_numpy=False) + expected = np.array([[[0.40594056, 0.29285714, -0.41000000], + [0.60891086, 0.04392857, 0.61500001]], + [[0.40594056, -1.17142856, 0.41000000], + [0.62920785, 0.54178572, 0.61500001]]]) + self.assertTrue(np.allclose(expected, np.array(res))) + + def test_dygraph_api(self): + self.input_data() + # case axis none + with fluid.dygraph.guard(): + input = [[[2.0, 2, -2], [3, 0.3, 3]], [[2, -8, 2], [3.1, 3.7, 3]]] + x = paddle.to_tensor(input, stop_gradient=False) + y = paddle.renorm(x, 1.0, 2, 2.05) + expected = np.array([[[0.40594056, 0.29285714, -0.41000000], + [0.60891086, 0.04392857, 0.61500001]], + [[0.40594056, -1.17142856, 0.41000000], + [0.62920785, 0.54178572, 0.61500001]]]) + self.assertTrue(np.allclose(expected, np.array(y))) + z = paddle.mean(y) + z.backward(retain_graph=True) + expected_grad = np.array( + [[[0, 0.01394558, 0.02733333], [0, 0.01394558, 0.00683333]], + [[0, 0.01045918, 0.00683333], [0, 0.01394558, 0.00683333]]]) + self.assertTrue(np.allclose(expected_grad, np.array(x.grad))) + #test exception: + with fluid.dygraph.guard(): + input = [[[2.0, 2, -2], [3, 0.3, 3]], [[2, -8, 2], [3.1, 3.7, 3]]] + x = paddle.to_tensor(input, stop_gradient=False) + exp = False + try: + paddle.renorm(x, 1.0, 8, 2.05) + except: + exp = True + self.assertTrue(exp) + exp = False + try: + paddle.renorm(x, 1.0, -4, 2.05) + except: + exp = True + self.assertTrue(exp) + y = paddle.renorm(x, 1.0, -1, 2.05) + expected = np.array([[[0.40594056, 0.29285714, -0.41000000], + [0.60891086, 0.04392857, 0.61500001]], + [[0.40594056, -1.17142856, 0.41000000], + [0.62920785, 0.54178572, 0.61500001]]]) + self.assertTrue(np.allclose(expected, np.array(y))) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index f4636ab3d1147..42225468bc41c 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -1330,6 +1330,24 @@ def set_value(array, i, op): array = array[0] +class TestSetValueInplace(unittest.TestCase): + def test_inplace(self): + paddle.disable_static() + with paddle.fluid.dygraph.guard(): + paddle.seed(100) + a = paddle.rand(shape=[1, 4]) + a.stop_gradient = False + b = a[:] + c = b + b[paddle.to_tensor(0)] = 1.0 + + self.assertTrue(id(b) == id(c)) + self.assertTrue(np.array_equal(b.numpy(), c.numpy())) + self.assertEqual(b.inplace_version, 1) + + paddle.enable_static() + + class TestSetValueInplaceLeafVar(unittest.TestCase): def test_inplace_var_become_leaf_var(self): paddle.disable_static() diff --git 
a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index bfaf694d9b4e3..817150a21f5e5 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -192,6 +192,7 @@ def test_sparse_parameter_sgd(self): class TestSGDOpWithLargeInput(unittest.TestCase): def runTest(self): + paddle.enable_static() data = fluid.layers.fill_constant(shape=[1], value=128, dtype='int64') label = fluid.layers.fill_constant( shape=[1, 150], value=0.5, dtype='float32') @@ -291,5 +292,212 @@ def test_sgd_group_dygraph(self): adam.clear_gradients() +class TestSGDMultiPrecision2_0(unittest.TestCase): + def dygraph_sgd_mp(self, mp): + paddle.disable_static() + paddle.seed(10) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.optimizer.SGD(parameters=model.parameters(), + multi_precision=mp) + if mp == True: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if mp == True: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + optimizer.step() + optimizer.clear_grad() + + return output, model.parameters() + + def static_sgd_mp(self, mp): + paddle.enable_static() + paddle.seed(10) + np.random.seed(10) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.optimizer.SGD(multi_precision=mp) + + if mp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False) + with paddle.static.program_guard(train_program, startup_program): + if mp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16') + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32') + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.fluid.layers.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if mp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + loss_data, = exe.run(train_program, + feed={"X": x}, + fetch_list=[loss.name]) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output1_dy, params1_dy = self.dygraph_sgd_mp(mp=True) + output2_dy, params2_dy = self.dygraph_sgd_mp(mp=False) + self.assertEqual( + np.allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + atol=1e-01), + True) + for idx in range(len(params1_dy)): + self.assertEqual( + np.allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + atol=1e-01), + True) + "Test static mode" + output1_st = self.static_sgd_mp(mp=True) + output2_st = self.static_sgd_mp(mp=False) + for idx in range(len(output1_st)): + self.assertEqual( + np.allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + atol=1e-01), + True) + + +class TestSGDMultiPrecision1_0(unittest.TestCase): + def dygraph_sgd_mp(self, mp): + 
paddle.disable_static() + paddle.seed(10) + paddle.set_device('gpu') + input = paddle.randn((2, 2)) + model = paddle.nn.Linear(2, 2) + optimizer = paddle.fluid.optimizer.SGD( + learning_rate=0.001, + parameter_list=model.parameters(), + multi_precision=mp) + if mp == True: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + for idx in range(5): + if mp == True: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_gradients() + else: + output = model(input) + loss = paddle.mean(output) + optimizer.minimize(loss) + optimizer.clear_gradients() + + return output, model.parameters() + + def static_sgd_mp(self, mp): + paddle.enable_static() + paddle.seed(10) + np.random.seed(10) + exe = paddle.static.Executor('gpu') + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001, + multi_precision=mp) + + if mp: + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=128.0, + use_dynamic_loss_scaling=True, + use_pure_fp16=True, + use_fp16_guard=False) + with paddle.static.program_guard(train_program, startup_program): + if mp: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16') + else: + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32') + hidden = paddle.static.nn.fc(x=data, size=10) + loss = paddle.fluid.layers.mean(hidden) + optimizer.minimize(loss) + exe.run(startup_program) + + if mp: + optimizer.amp_init(place='gpu', scope=paddle.static.global_scope()) + x = np.random.random(size=(2, 2)).astype('float16') + else: + x = np.random.random(size=(2, 2)).astype('float32') + out = [] + for idx in range(5): + loss_data, = exe.run(train_program, + feed={"X": x}, + fetch_list=[loss.name]) + out.append(loss_data) + return out + + def test_main(self): + if not paddle.is_compiled_with_cuda(): + return + "Test dygraph mode" + output1_dy, params1_dy = self.dygraph_sgd_mp(mp=True) + output2_dy, params2_dy = self.dygraph_sgd_mp(mp=False) + self.assertEqual( + np.allclose( + output1_dy.astype('float32').numpy(), + output2_dy.astype('float32').numpy(), + atol=1e-01), + True) + for idx in range(len(params1_dy)): + self.assertEqual( + np.allclose( + params1_dy[idx].astype('float32').numpy(), + params2_dy[idx].astype('float32').numpy(), + atol=1e-01), + True) + "Test static mode" + output1_st = self.static_sgd_mp(mp=True) + output2_st = self.static_sgd_mp(mp=False) + for idx in range(len(output1_st)): + self.assertEqual( + np.allclose( + output1_st[idx].astype('float32'), + output2_st[idx].astype('float32'), + atol=1e-01), + True) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index b42520a6a1c13..eddccd4ff24f1 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -24,6 +24,7 @@ from paddle.fluid.op import Operator from paddle.fluid.tests.unittests.op_test import ( OpTest, convert_float_to_uint16, convert_uint16_to_float) +from paddle import _C_ops class TestSumOp(OpTest): @@ -382,11 +383,11 @@ class TestSumOpError(unittest.TestCase): def test_errors(self): def test_empty_list_input(): with fluid.dygraph.guard(): - fluid.core.ops.sum([]) + fluid._C_ops.sum([]) def 
test_list_of_none_input(): with fluid.dygraph.guard(): - fluid.core.ops.sum([None]) + fluid._C_ops.sum([None]) self.assertRaises(Exception, test_empty_list_input) self.assertRaises(Exception, test_list_of_none_input) diff --git a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py new file mode 100644 index 0000000000000..97162eb9c706b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.framework import core +from paddle.fluid.dygraph.base import switch_to_static_graph + +paddle.enable_static() + + +class TestTakeAlongAxisOp(OpTest): + def setUp(self): + self.init_data() + self.op_type = "take_along_axis" + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + self.target = np.take_along_axis(self.xnp, self.index, self.axis) + broadcast_shape_list = list(self.x_shape) + broadcast_shape_list[self.axis] = 1 + self.braodcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.braodcast_shape) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index_broadcast, + } + self.attrs = {'Axis': self.axis} + self.outputs = {'Result': self.target} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Input'], 'Result') + + def init_data(self): + self.x_type = "float64" + self.x_shape = (5, 5, 5) + self.index_type = "int32" + self.index = np.array( + [[[1]], [[1]], [[2]], [[4]], [[3]]]).astype(self.index_type) + self.axis = 2 + self.axis_type = "int64" + + +class TestCase1(TestTakeAlongAxisOp): + def init_data(self): + self.x_type = "float64" + self.x_shape = (5, 5, 5) + self.index_type = "int32" + self.index = np.array([[[0, 1, 2, 1, 4]]]).astype(self.index_type) + self.axis = 0 + self.axis_type = "int64" + + +class TestTakeAlongAxisAPI(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.shape = [3, 3] + self.index_shape = [1, 3] + self.index_np = np.array([[0, 1, 2]]).astype('int64') + self.x_np = np.random.random(self.shape).astype(np.float32) + self.place = [paddle.CPUPlace()] + self.axis = 0 + if core.is_compiled_with_cuda(): + self.place.append(paddle.CUDAPlace(0)) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', self.shape) + index = paddle.fluid.data('Index', self.index_shape, "int64") + out = paddle.take_along_axis(x, index, self.axis) + exe = paddle.static.Executor(self.place[0]) + res = exe.run(feed={'X': self.x_np, + 'Index': self.index_np}, + fetch_list=[out]) + out_ref = np.array( + np.take_along_axis(self.x_np, self.index_np, self.axis)) + for out in res: + 
self.assertEqual(np.allclose(out, out_ref, rtol=1e-03), True) + + def test_api_dygraph(self): + paddle.disable_static(self.place[0]) + x_tensor = paddle.to_tensor(self.x_np) + self.index = paddle.to_tensor(self.index_np) + out = paddle.take_along_axis(x_tensor, self.index, self.axis) + out_ref = np.array( + np.take_along_axis(self.x_np, self.index_np, self.axis)) + self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-03), True) + paddle.enable_static() + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py index 49cf1bf65c3ed..0f64f7f5d8d10 100644 --- a/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py +++ b/python/paddle/fluid/tests/unittests/test_transfer_layout_op.py @@ -30,7 +30,7 @@ def setUp(self): self.inputs = {'X': ipt.astype('float32')} self.outputs = {'Out': ipt.transpose([0, 2, 3, 1])} self.attrs = { - 'dst_layout': 0 # kNHWC + 'dst_layout': 1 # kNHWC } self.op_type = 'transfer_layout' diff --git a/python/paddle/fluid/tests/unittests/test_unpool1d_op.py b/python/paddle/fluid/tests/unittests/test_unpool1d_op.py new file mode 100644 index 0000000000000..95d19210acb72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_unpool1d_op.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.nn.functional as F + +paddle.enable_static() +paddle.seed(2022) + + +def _unpool_output_size(x, kernel_size, stride, padding, output_size): + input_size = x.shape + default_size = [] + for d in range(len(kernel_size)): + default_size.append((input_size[-len(kernel_size) + d] - 1) * stride[d] + + kernel_size[d] - 2 * padding[d]) + if output_size is None: + ret = default_size + else: + ret = output_size + return ret + + +def unpool1dmax_forward_naive(input, indices, ksize, strides, paddings, + output_size): + s0, s1, s2 = input.shape + output_size = _unpool_output_size(input, ksize, strides, paddings, + output_size) + out_lsize = output_size[0] + out = np.zeros((s0, s1, out_lsize)) + for nidx in range(s0): + for cidx in range(s1): + for l in range(s2): + index = indices[nidx, cidx, l] + lidx = index % out_lsize + out[nidx, cidx, lidx] = input[nidx, cidx, l] + + return out + + +class TestUnpool1DOpAPI_dygraph(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 16) + input_x = paddle.to_tensor(input_data) + output, indices = F.max_pool1d( + input_x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool1d( + output, indices, kernel_size=2, stride=2) + expected_output_unpool = unpool1dmax_forward_naive( + output.numpy(), indices.numpy(), [2], [2], [0], [16]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool1DOpAPI_dygraph2(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 16) + input_x = paddle.to_tensor(input_data) + output, indices = F.max_pool1d( + input_x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool1d( + output, indices, kernel_size=2, stride=None) + expected_output_unpool = unpool1dmax_forward_naive( + output.numpy(), indices.numpy(), [2], [2], [0], [16]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool1DOpAPI_dygraph3(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 16) + input_x = paddle.to_tensor(input_data) + Pool1d = paddle.nn.MaxPool1D( + kernel_size=2, stride=2, return_mask=True) + UnPool1d = paddle.nn.MaxUnPool1D(kernel_size=2, stride=2) + + output, indices = Pool1d(input_x) + output_unpool = UnPool1d(output, indices) + expected_output_unpool = unpool1dmax_forward_naive( + output.numpy(), indices.numpy(), [2], [2], [0], [16]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool1DOpAPI_static(unittest.TestCase): + def test_case(self): + paddle.enable_static() + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + input_data 
= np.array([[[1, 2, 3, 4], [5, 6, 7, 8], + [9, 10, 11, 12]]]).astype("float32") + x = paddle.fluid.data( + name='x', shape=[1, 3, 4], dtype='float32') + output, indices = F.max_pool1d( + x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool1d( + output, indices, kernel_size=2, stride=None) + + exe = paddle.fluid.Executor(place) + fetches = exe.run(paddle.fluid.default_main_program(), + feed={"x": input_data}, + fetch_list=[output_unpool], + return_numpy=True) + pool1d_out_np = np.array( + [[[2., 4.], [6., 8.], [10., 12.]]]).astype("float32") + indices_np = np.array( + [[[1, 3], [1, 3], [1, 3]]]).astype("int32") + expected_output_unpool = unpool1dmax_forward_naive( + pool1d_out_np, indices_np, [2], [2], [0], [4]) + self.assertTrue(np.allclose(fetches[0], expected_output_unpool)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unpool3d_op.py b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py new file mode 100644 index 0000000000000..e6031d9cee8b1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_unpool3d_op.py @@ -0,0 +1,293 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.nn.functional as F + +paddle.enable_static() +paddle.seed(2022) + + +def _unpool_output_size(x, kernel_size, stride, padding, output_size): + input_size = x.shape + default_size = [] + for d in range(len(kernel_size)): + default_size.append((input_size[-len(kernel_size) + d] - 1) * stride[d] + + kernel_size[d] - 2 * padding[d]) + if output_size is None: + ret = default_size + else: + ret = output_size + return ret + + +def unpool3dmax_forward_naive(input, indices, ksize, strides, paddings, + output_size): + s0, s1, s2, s3, s4 = input.shape + output_size = _unpool_output_size(input, ksize, strides, paddings, + output_size) + out_dsize = output_size[0] + out_hsize = output_size[1] + out_wsize = output_size[2] + out = np.zeros((s0, s1, out_dsize, out_hsize, out_wsize)) + for nidx in range(s0): + for cidx in range(s1): + for d in range(s2): + for h in range(s3): + for w in range(s4): + index = indices[nidx, cidx, d, h, w] + didx = index // (out_wsize * out_hsize) + hidx = ( + index - didx * out_hsize * out_wsize) // out_wsize + widx = ( + index - didx * out_hsize * out_wsize) % out_wsize + out[nidx, cidx, didx, hidx, widx] = \ + input[nidx, cidx, d, h, w] + + return out + + +class TestUnpool3DOp(OpTest): + def setUp(self): + self.op_type = "unpool3d" + self.init_test_case() + inputs = np.random.randint(0, 100, self.shape) + nsize, csize, dsize, hsize, wsize = inputs.shape + self.output_size = _unpool_output_size(inputs, self.ksize, self.strides, + self.paddings, self.output_size) + indices = np.random.permutation( + np.arange(0, self.output_size[0] * self.output_size[1] * + self.output_size[2]))[:dsize * hsize * wsize] + indices = 
np.reshape(indices, [dsize, hsize, wsize]) + idx_list = [] + for n in range(nsize): + c_list = [] + for c in range(csize): + c_list.append(indices.tolist()) + idx_list.append(c_list) + indices = np.array(idx_list) + + output = self.unpool3d_forward_naive(inputs, indices, self.ksize, \ + self.strides, self.paddings, self.output_size).astype("float64") + + self.inputs = { + 'X': inputs.astype('float64'), + 'Indices': indices.astype('int32') + } + self.attrs = { + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'unpooling_type': self.unpooling_type, + 'output_size': self.output_size, + } + self.outputs = {'Out': output.astype('float64')} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + def init_test_case(self): + self.unpool3d_forward_naive = unpool3dmax_forward_naive + self.unpooling_type = "max" + self.shape = [1, 1, 4, 5, 6] + self.ksize = [2, 2, 2] + self.strides = [2, 2, 2] + self.paddings = [0, 0, 0] + self.output_size = None + + +class TestUnpool3DOpcase1(TestUnpool3DOp): + def init_test_case(self): + self.unpool3d_forward_naive = unpool3dmax_forward_naive + self.unpooling_type = "max" + self.shape = [1, 3, 4, 5, 6] + self.ksize = [2, 2, 2] + self.strides = [2, 2, 2] + self.paddings = [0, 0, 0] + self.output_size = None + + +class TestUnpool3DOpOutput(TestUnpool3DOp): + def init_test_case(self): + self.unpool3d_forward_naive = unpool3dmax_forward_naive + self.unpooling_type = "max" + self.shape = [1, 3, 4, 5, 6] + self.ksize = [2, 2, 2] + self.strides = [2, 2, 2] + self.paddings = [0, 0, 0] + self.output_size = [7, 9, 11] + + +class TestUnpool3DOpException(unittest.TestCase): + def test_exception(self): + def indices_size_error(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 36), shape=[1, 1, 3, 3, 4]) + MaxUnPool3D = F.maxunpool3d(data, indices, kernel_size=2, stride=2) + + def indices_value_error(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(4, 40), shape=[1, 1, 3, 3, 3]) + MaxUnPool3D = F.maxunpool3d(data, indices, kernel_size=2, stride=2) + + def data_format_error(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 27), shape=[1, 1, 3, 3, 3]) + MaxUnPool3D = F.maxunpool3d( + data, indices, kernel_size=2, stride=2, data_format="NDHWC") + + def data_outputsize_error(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 27), shape=[1, 1, 3, 3, 3]) + MaxUnPool3D = F.maxunpool3d( + data, + indices, + kernel_size=2, + stride=2, + output_size=[2, 2, 3, 4, 5]) + + def data_outputsize_error2(): + data = paddle.randint(shape=[1, 1, 3, 3, 3]) + indices = paddle.reshape( + paddle.arange(0, 27), shape=[1, 1, 3, 3, 3]) + MaxUnPool3D = F.maxunpool3d( + data, + indices, + kernel_size=2, + stride=2, + output_size=[10, 10, 10]) + + self.assertRaises(ValueError, indices_size_error) + self.assertRaises(ValueError, indices_value_error) + self.assertRaises(ValueError, data_format_error) + self.assertRaises(ValueError, data_outputsize_error) + self.assertRaises(ValueError, data_outputsize_error2) + + +class TestUnpool3DOpAPI_dygraph(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 4, 4, 6) + input_x = paddle.to_tensor(input_data) 
+ output, indices = F.max_pool3d( + input_x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool3d( + output, indices, kernel_size=2, stride=2) + expected_output_unpool = unpool3dmax_forward_naive( + output.numpy(), + indices.numpy(), [2, 2, 2], [2, 2, 2], [0, 0, 0], [4, 4, 6]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool3DOpAPI_dygraph2(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 4, 4, 6) + input_x = paddle.to_tensor(input_data) + output, indices = F.max_pool3d( + input_x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool3d( + output, indices, kernel_size=2, stride=None) + expected_output_unpool = unpool3dmax_forward_naive( + output.numpy(), + indices.numpy(), [2, 2, 2], [2, 2, 2], [0, 0, 0], [4, 4, 6]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool3DOpAPI_dygraph3(unittest.TestCase): + def test_case(self): + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + paddle.disable_static() + input_data = np.random.rand(1, 3, 4, 4, 6) + input_x = paddle.to_tensor(input_data) + Pool3d = paddle.nn.MaxPool3D( + kernel_size=2, stride=2, return_mask=True) + UnPool3d = paddle.nn.MaxUnPool3D(kernel_size=2, stride=2) + + output, indices = Pool3d(input_x) + output_unpool = UnPool3d(output, indices) + expected_output_unpool = unpool3dmax_forward_naive( + output.numpy(), + indices.numpy(), [2, 2, 2], [2, 2, 2], [0, 0, 0], [4, 4, 6]) + self.assertTrue( + np.allclose(output_unpool.numpy(), expected_output_unpool)) + + paddle.enable_static() + + +class TestUnpool3DOpAPI_static(unittest.TestCase): + def test_case(self): + paddle.enable_static() + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + input_data = np.array([[[[[1, 2, 3, 4], [5, 6, 7, 8], \ + [9, 10, 11, 12], [13, 14, 15, 16]], [[1, 2, 3, 4], [5, 6, 7, 8], \ + [9, 10, 11, 12], [13, 14, 15, 16]]]]]).astype("float32") + x = paddle.fluid.data( + name='x', shape=[1, 1, 2, 4, 4], dtype='float32') + output, indices = F.max_pool3d( + x, kernel_size=2, stride=2, return_mask=True) + output_unpool = F.max_unpool3d( + output, indices, kernel_size=2, stride=None) + + exe = paddle.fluid.Executor(place) + fetches = exe.run(paddle.fluid.default_main_program(), + feed={"x": input_data}, + fetch_list=[output_unpool], + return_numpy=True) + pool3d_out_np = np.array( + [[[[[6., 8.], [14., 16.]]]]]).astype("float32") + indices_np = np.array([[[[[5, 7], [13, 15]]]]]).astype("int32") + expected_output_unpool = unpool3dmax_forward_naive( + pool3d_out_np, indices_np, [2, 2, 2], [2, 2, 2], [0, 0, 0], + [2, 4, 4]) + self.assertTrue(np.allclose(fetches[0], expected_output_unpool)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index ab6e8003833ec..c4c4edbbb9335 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ 
-497,6 +497,41 @@ def test_to_string(self): var = fluid.dygraph.to_variable(self.array) self.assertTrue(isinstance(str(var), str)) + def test_element_size(self): + with fluid.dygraph.guard(): + x = paddle.to_tensor(1, dtype='bool') + self.assertEqual(x.element_size(), 1) + + x = paddle.to_tensor(1, dtype='float16') + self.assertEqual(x.element_size(), 2) + + x = paddle.to_tensor(1, dtype='float32') + self.assertEqual(x.element_size(), 4) + + x = paddle.to_tensor(1, dtype='float64') + self.assertEqual(x.element_size(), 8) + + x = paddle.to_tensor(1, dtype='int8') + self.assertEqual(x.element_size(), 1) + + x = paddle.to_tensor(1, dtype='int16') + self.assertEqual(x.element_size(), 2) + + x = paddle.to_tensor(1, dtype='int32') + self.assertEqual(x.element_size(), 4) + + x = paddle.to_tensor(1, dtype='int64') + self.assertEqual(x.element_size(), 8) + + x = paddle.to_tensor(1, dtype='uint8') + self.assertEqual(x.element_size(), 1) + + x = paddle.to_tensor(1, dtype='complex64') + self.assertEqual(x.element_size(), 8) + + x = paddle.to_tensor(1, dtype='complex128') + self.assertEqual(x.element_size(), 16) + def test_backward(self): with fluid.dygraph.guard(): var = fluid.dygraph.to_variable(self.array) diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 2eb3ecf710481..5ba54daa0d4cb 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -63,6 +63,35 @@ def test_var(self): self.assertRaises(ValueError, lambda: b.create_var(name="fc.w", shape=(24, 100))) + def test_element_size(self): + with fluid.program_guard(Program(), Program()): + x = paddle.static.data(name='x1', shape=[2], dtype='bool') + self.assertEqual(x.element_size(), 1) + + x = paddle.static.data(name='x2', shape=[2], dtype='float16') + self.assertEqual(x.element_size(), 2) + + x = paddle.static.data(name='x3', shape=[2], dtype='float32') + self.assertEqual(x.element_size(), 4) + + x = paddle.static.data(name='x4', shape=[2], dtype='float64') + self.assertEqual(x.element_size(), 8) + + x = paddle.static.data(name='x5', shape=[2], dtype='int8') + self.assertEqual(x.element_size(), 1) + + x = paddle.static.data(name='x6', shape=[2], dtype='int16') + self.assertEqual(x.element_size(), 2) + + x = paddle.static.data(name='x7', shape=[2], dtype='int32') + self.assertEqual(x.element_size(), 4) + + x = paddle.static.data(name='x8', shape=[2], dtype='int64') + self.assertEqual(x.element_size(), 8) + + x = paddle.static.data(name='x9', shape=[2], dtype='uint8') + self.assertEqual(x.element_size(), 1) + def test_step_scopes(self): prog = Program() b = prog.current_block() diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py index 1c8c89d13abc7..5deca1dc5acd4 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py @@ -49,6 +49,7 @@ 'sparse_attention', \ 'svd', \ 'matrix_power', \ + 'cholesky_solve', \ 'solve', \ ] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index c2c69be45bf30..ce82b20eca42d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -154,6 +154,11 @@ def setUp(self): 
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + @unittest.skipIf(not paddle.is_compiled_with_xpu(), "core is not compiled with XPU") @@ -334,6 +339,25 @@ def leaky_relu(x, alpha): return y_ref.astype(x.dtype) +class TestXPUReciprocal(TestXPUActivation): + def setUp(self): + self.op_type = "reciprocal" + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(1, 2, [1111, 1117]).astype(self.dtype) + out = np.reciprocal(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py new file mode 100644 index 0000000000000..5c77d6304302c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py @@ -0,0 +1,237 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +from op_test_xpu import XPUOpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + +from paddle.fluid import ParamAttr +from paddle.fluid.framework import Program, grad_var_name +from paddle.fluid.executor import Executor +from paddle.fluid.backward import append_backward + +paddle.enable_static() + + +class TestArgsortOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "argsort" + self.place = paddle.XPUPlace(0) + self.init_dtype() + self.init_inputshape() + self.init_axis() + self.init_direction() + + self.x = np.random.random(self.input_shape).astype(self.dtype) + self.inputs = {"X": self.x} + self.attrs = {"axis": self.axis, "descending": self.descending} + self.get_output() + self.outputs = {"Out": self.sorted_x, "Indices": self.indices} + + def get_output(self): + if self.descending: + self.indices = np.flip( + np.argsort( + self.x, kind='heapsort', axis=self.axis), self.axis) + self.sorted_x = np.flip( + np.sort( + self.x, kind='heapsort', axis=self.axis), self.axis) + else: + self.indices = np.argsort(self.x, kind='heapsort', axis=self.axis) + self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis) + + def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = True + + def init_inputshape(self): + self.input_shape = (2, 2, 2, 3, 3) + + def init_dtype(self): + self.dtype = 'float32' + + def init_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_direction(self): + self.descending = False + + +class TestArgsortOpAxis0XPU(TestArgsortOp): + def init_axis(self): + self.axis = 0 + + +class TestArgsortOpAxis1XPU(TestArgsortOp): + def init_axis(self): + self.axis = 1 + + +class TestArgsortOpAxis2XPU(TestArgsortOp): + def init_axis(self): + self.axis = 2 + + +class TestArgsortOpAxisNeg1XPU(TestArgsortOp): + def init_axis(self): + self.axis = -1 + + +class TestArgsortOpAxisNeg2XPU(TestArgsortOp): + def init_axis(self): + self.axis = -2 + + +class TestArgsortOpDescendingAxisXPU(TestArgsortOp): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis0XPU(TestArgsortOpAxis0XPU): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis1XPU(TestArgsortOpAxis1XPU): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis2XPU(TestArgsortOpAxis2XPU): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxisNeg1XPU(TestArgsortOpAxisNeg1XPU): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxisNeg2XPU(TestArgsortOpAxisNeg2XPU): + def init_direction(self): + self.descending = True + + +class TestArgsortOpAxis0XPUINT64(TestArgsortOp): + def setUp(self): + self.set_xpu() + self.op_type = "argsort" + self.place = paddle.XPUPlace(0) + self.init_dtype() + self.init_inputshape() + self.init_axis() + self.init_direction() + + self.x = np.random.randint( + low=-1000, high=1000, size=self.input_shape).astype(self.dtype) + self.inputs = {"X": self.x} + self.attrs = {"axis": self.axis, "descending": self.descending} + self.get_output() + self.outputs = {"Out": self.sorted_x, "Indices": self.indices} + + def init_axis(self): + self.axis = 0 + + def init_dtype(self): + self.dtype = 'int64' + + +class 
TestArgsortOpAxis1XPUINT64(TestArgsortOpAxis0XPUINT64): + def init_axis(self): + self.axis = 1 + + +class TestArgsortOpAxis2XPUINT64(TestArgsortOpAxis0XPUINT64): + def init_axis(self): + self.axis = 2 + + +class TestArgsortOpAxisNeg1XPUINT64(TestArgsortOpAxis0XPUINT64): + def init_axis(self): + self.axis = -1 + + +class TestArgsortOpAxisNeg2XPUINT64(TestArgsortOpAxis0XPUINT64): + def init_axis(self): + self.axis = -2 + + +class TestArgsortOpDescendingAxisXPUINT64(TestArgsortOpAxis0XPUINT64): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis0XPUINT64(TestArgsortOpAxis0XPUINT64): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis1XPUINT64(TestArgsortOpAxis1XPUINT64): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis2XPUINT64(TestArgsortOpAxis2XPUINT64): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxisNeg1XPUINT64(TestArgsortOpAxisNeg1XPUINT64): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxisNeg2XPUINT64(TestArgsortOpAxisNeg2XPUINT64): + def init_direction(self): + self.descending = True + + +class TestArgsortOpAxis0XPUINT(TestArgsortOp): + def setUp(self): + self.set_xpu() + self.op_type = "argsort" + self.place = paddle.XPUPlace(0) + self.init_dtype() + self.init_inputshape() + self.init_axis() + self.init_direction() + + self.x = np.random.randint( + low=-1000, high=1000, size=self.input_shape).astype(self.dtype) + self.inputs = {"X": self.x} + self.attrs = {"axis": self.axis, "descending": self.descending} + self.get_output() + self.outputs = {"Out": self.sorted_x, "Indices": self.indices} + + def init_axis(self): + self.axis = 0 + + def init_dtype(self): + self.dtype = 'int' + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py index 8132a78f69675..9cd34c82650e9 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -267,5 +267,58 @@ def test_train(self): outputs[name], outs[id], atol=1e-4), True) +class TestXPUBatchNormOpUseGlobalStats(unittest.TestCase): + def setUp(self): + self.places = [paddle.XPUPlace(0)] + self.init_test() + + ### train mode + def init_test(self): + self.use_global_stats = True + self.trainable_statistics = False + + def test_global_stats(self): + for p in self.places: + with fluid.dygraph.guard(p): + x = paddle.randn([2, 6, 6, 4]) + net1 = paddle.fluid.dygraph.BatchNorm( + 6, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(1.0)), + use_global_stats=self.use_global_stats, + trainable_statistics=self.trainable_statistics) + net2 = paddle.nn.BatchNorm2D( + 6, use_global_stats=self.use_global_stats) + net2.weight = net1.weight + net2.bias = net1.bias + if self.trainable_statistics == True: + net1.training = False + net2.training = False + y1 = net1(x) + y2 = net2(x) + self.assertEqual(np.allclose(y1.numpy(), y2.numpy()), True) + + +class TestXPUBatchNormUseGlobalStatsCase1(TestXPUBatchNormOpUseGlobalStats): + ### test mode + def init_test(self): + self.use_global_stats = False + self.trainable_statistics = True + + +class TestXPUBatchNormUseGlobalStatsCase2(TestXPUBatchNormOpUseGlobalStats): + ### train mode + def init_test(self): + self.use_global_stats = False + self.trainable_statistics = False + 
+ +class TestXPUBatchNormUseGlobalStatsCase3(TestXPUBatchNormOpUseGlobalStats): + ### test mode + def init_test(self): + self.use_global_stats = True + self.trainable_statistics = True + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py new file mode 100644 index 0000000000000..0cd98d2daea2c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +from op_test_xpu import XPUOpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard + +paddle.enable_static() + + +def huber_loss_forward(val, delta): + abs_val = abs(val) + if abs_val <= delta: + return 0.5 * val * val + else: + return delta * (abs_val - 0.5 * delta) + + +class TestHuberLossOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = 'huber_loss' + self.place = paddle.XPUPlace(0) + + self.init_dtype() + + self.set_inputs() + self.set_attrs() + self.set_outputs() + + def set_inputs(self): + shape = self.set_shape() + x = np.random.uniform(0, 1., shape).astype(self.dtype) + y = np.random.uniform(0, 1., shape).astype(self.dtype) + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + + def set_attrs(self): + self.attrs = {'delta': 0.5} + + def set_outputs(self): + delta = self.attrs['delta'] + shape = self.set_shape() + residual = self.inputs['Y'] - self.inputs['X'] + loss = np.vectorize(huber_loss_forward)(residual, + delta).astype(self.dtype) + self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)} + + def set_shape(self): + return (100, 1) + + def set_xpu(self): + self.__class__.use_xpu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + self.check_grad_with_place( + self.place, ['Y'], 'Out', no_grad_set=set("residual")) + + def test_check_grad_ignore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set('residual')) + + +class TestHuberLossOp1(TestHuberLossOp): + def set_shape(self): + return (64) + + +class TestHuberLossOp2(TestHuberLossOp): + def set_shape(self): + return (6, 6) + + +class TestHuberLossOp3(TestHuberLossOp): + def set_shape(self): + return (6, 6, 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py index 44c356ca65f29..5e866dddbe28e 100644 ---
a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py @@ -29,7 +29,7 @@ class TestMeanOp(OpTest): def setUp(self): self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} self.attrs = {'use_xpu': True} self.outputs = {'Out': self.inputs['X'].mean(axis=0)} @@ -46,7 +46,7 @@ class TestMeanOp5D(OpTest): def setUp(self): self.op_type = "reduce_mean" self.inputs = { - 'X': np.random.random((1, 2, 5, 6, 10)).astype("float64") + 'X': np.random.random((1, 2, 5, 6, 10)).astype("float32") } self.attrs = {'use_xpu': True} self.outputs = {'Out': self.inputs['X'].mean(axis=0)} @@ -64,7 +64,7 @@ class TestMeanOp6D(OpTest): def setUp(self): self.op_type = "reduce_mean" self.inputs = { - 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float64") + 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float32") } self.attrs = {'use_xpu': True} self.outputs = {'Out': self.inputs['X'].mean(axis=0)} @@ -82,7 +82,7 @@ class TestMeanOp8D(OpTest): def setUp(self): self.op_type = "reduce_mean" self.inputs = { - 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float64") + 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float32") } self.attrs = {'dim': (0, 3), 'use_xpu': True} self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} @@ -99,7 +99,7 @@ def test_check_grad(self): class Test1DReduce(OpTest): def setUp(self): self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random(120).astype("float64")} + self.inputs = {'X': np.random.random(120).astype("float32")} self.attrs = {'use_xpu': True} self.outputs = {'Out': self.inputs['X'].mean(axis=0)} @@ -116,7 +116,7 @@ class Test2DReduce0(Test1DReduce): def setUp(self): self.op_type = "reduce_mean" self.attrs = {'dim': [0], 'use_xpu': True} - self.inputs = {'X': np.random.random((20, 10)).astype("float64")} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} self.outputs = {'Out': self.inputs['X'].mean(axis=0)} @@ -124,7 +124,7 @@ class Test2DReduce1(Test1DReduce): def setUp(self): self.op_type = "reduce_mean" self.attrs = {'dim': [1], 'use_xpu': True} - self.inputs = {'X': np.random.random((20, 10)).astype("float64")} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} self.outputs = { 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) } @@ -134,7 +134,7 @@ class Test3DReduce0(Test1DReduce): def setUp(self): self.op_type = "reduce_mean" self.attrs = {'dim': [1], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} self.outputs = { 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) } @@ -144,7 +144,7 @@ class Test3DReduce1(Test1DReduce): def setUp(self): self.op_type = "reduce_mean" self.attrs = {'dim': [2], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} self.outputs = { 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) } @@ -154,7 +154,7 @@ class Test3DReduce2(Test1DReduce): def setUp(self): self.op_type = "reduce_mean" self.attrs = {'dim': [-2], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} self.outputs = { 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) } 
@@ -164,7 +164,7 @@ class Test3DReduce3(Test1DReduce): def setUp(self): self.op_type = "reduce_mean" self.attrs = {'dim': [1, 2], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} self.outputs = { 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) } @@ -173,7 +173,7 @@ def setUp(self): class TestKeepDimReduce(Test1DReduce): def setUp(self): self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} self.attrs = {'dim': [1], 'keep_dim': True, 'use_xpu': True} self.outputs = { 'Out': self.inputs['X'].mean( @@ -185,7 +185,7 @@ class TestKeepDim8DReduce(Test1DReduce): def setUp(self): self.op_type = "reduce_mean" self.inputs = { - 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float32") } self.attrs = {'dim': (3, 4, 5), 'keep_dim': True, 'use_xpu': True} self.outputs = { @@ -197,7 +197,7 @@ def setUp(self): class TestReduceAll(Test1DReduce): def setUp(self): self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} self.attrs = {'reduce_all': True, 'use_xpu': True} self.outputs = {'Out': self.inputs['X'].mean()} diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py new file mode 100644 index 0000000000000..44686ae418bfc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py @@ -0,0 +1,155 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test_xpu import OpTest, XPUOpTest +from op_test import skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + + +class TestXPUReduceProdOp(XPUOpTest): + def setUp(self): + self.init_op_type() + self.initTestCase() + self.use_xpu = True + self.use_mkldnn = False + self.attrs = { + 'dim': self.axis, + 'keep_dim': self.keep_dim, + 'reduce_all': self.reduce_all + } + self.inputs = {'X': np.random.random(self.shape).astype("float32")} + if self.attrs['reduce_all']: + self.outputs = {'Out': self.inputs['X'].prod()} + else: + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=self.axis, keepdims=self.attrs['keep_dim']) + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + def init_op_type(self): + self.op_type = "reduce_prod" + self.use_mkldnn = False + self.keep_dim = False + self.reduce_all = False + + def initTestCase(self): + self.shape = (5, 6, 10) + self.axis = (0, ) + + +class TestProdOp5D(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (1, 2, 5, 6, 10) + self.axis = (0, ) + + +class TestProdOp6D(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (1, 1, 2, 5, 6, 10) + self.axis = (0, ) + + +class TestProdOp8D(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (1, 3, 1, 2, 1, 4, 3, 10) + self.axis = (0, 3) + + +class Test1DReduce(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = 120 + self.axis = (0, ) + + +class Test2DReduce0(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (20, 10) + self.axis = (0, ) + + +class Test2DReduce1(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (20, 10) + self.axis = (1, ) + + +class Test3DReduce0(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (1, ) + + +class Test3DReduce1(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (2, ) + + +class Test3DReduce2(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (-2, ) + + +class Test3DReduce3(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (1, 2) + + +class TestKeepDimReduce(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (5, 6, 10) + self.axis = (1, ) + self.keep_dim = True + + +class TestKeepDim8DReduce(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (2, 5, 3, 2, 2, 3, 4, 2) + self.axis = (3, 4, 5) + self.keep_dim = True + + +class TestReduceAll(TestXPUReduceProdOp): + def initTestCase(self): + self.shape = (5, 6, 2, 10) + self.axis = (0, ) + self.reduce_all = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py new file mode 100644 index 0000000000000..16b75cd3f0145 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py @@ -0,0 +1,169 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +from op_test_xpu import XPUOpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + +paddle.enable_static() + + +class TestScatterOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "scatter" + self.place = paddle.XPUPlace(0) + + ref_np = np.ones((3, 50)).astype("float32") + index_np = np.array([1, 2]).astype("int32") + updates_np = np.random.random((2, 50)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + def set_xpu(self): + self.__class__.use_xpu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestScatterOp0(TestScatterOp): + def setUp(self): + self.set_xpu() + self.op_type = "scatter" + self.place = paddle.XPUPlace(0) + + ref_np = np.ones((3, 3)).astype("float32") + index_np = np.array([1, 2]).astype("int32") + updates_np = np.random.random((2, 3)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.attrs = {'overwrite': True} + self.outputs = {'Out': output_np} + + +class TestScatterOp1(TestScatterOp): + def setUp(self): + self.set_xpu() + self.op_type = "scatter" + self.place = paddle.XPUPlace(0) + + ref_np = np.ones((3, 3)).astype("float32") + zeros_np = np.zeros([2, 3]).astype('float32') + index_np = np.array([1, 1]).astype("int32") + updates_np = np.random.random((2, 3)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = zeros_np + for i in range(0, len(index_np)): + output_np[index_np[i]] += updates_np[i] + self.attrs = {'overwrite': False} + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + +class TestScatterOp2(TestScatterOp): + def setUp(self): + self.set_xpu() + self.op_type = "scatter" + self.place = paddle.XPUPlace(0) + + ref_np = np.ones((3, 3)).astype("float32") + index_np = np.array([1, 2]).astype("int32") + updates_np = np.random.random((2, 3)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + +class TestScatterOp3(TestScatterOp): + def setUp(self): + self.set_xpu() + self.op_type = "scatter" + self.place = paddle.XPUPlace(0) + + ref_np = np.ones((3, 3)).astype("float32") + zeros_np = np.zeros([2, 3]).astype('float32') + index_np = np.array([1, 1]).astype("int32") + updates_np = np.random.random((2, 3)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = zeros_np + for i in range(0, len(index_np)): + output_np[index_np[i]] += updates_np[i] + self.attrs = {'overwrite': 
False} + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + +class TestScatterOp4(TestScatterOp): + def setUp(self): + self.set_xpu() + self.op_type = "scatter" + self.place = paddle.XPUPlace(0) + + ref_np = np.ones((3, 3)).astype("float32") + index_np = np.array([1, 2]).astype("int64") + updates_np = np.random.random((2, 3)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + +class TestScatterOp5(TestScatterOp): + def setUp(self): + self.set_xpu() + self.op_type = "scatter" + self.place = paddle.XPUPlace(0) + + ref_np = np.ones((3, 3)).astype("float32") + index_np = np.array([1, 2]).astype("int64") + updates_np = np.random.random((2, 3)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + +class TestScatterOp6(TestScatterOp): + def setUp(self): + self.set_xpu() + self.op_type = "scatter" + self.place = paddle.XPUPlace(0) + + ref_np = np.ones((3, 3)).astype("int64") + index_np = np.array([1, 2]).astype("int64") + updates_np = np.random.random((2, 3)).astype("int64") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py new file mode 100644 index 0000000000000..4ceacd5209234 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py @@ -0,0 +1,164 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
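A standalone sketch, outside the patch, of the NumPy reference the scatter XPU cases above compute, contrasting overwrite=True with the accumulating overwrite=False path exercised when indices repeat.

import numpy as np

ref = np.ones((3, 3), dtype="float32")
index = np.array([1, 1], dtype="int32")
updates = np.random.random((2, 3)).astype("float32")

overwrite_out = np.copy(ref)
overwrite_out[index] = updates  # duplicate ids: the last write wins

accumulate_out = np.copy(ref)
accumulate_out[index] = 0.0     # duplicated rows are zeroed first,
for i in range(len(index)):
    accumulate_out[index[i]] += updates[i]  # then every update is summed in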
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test_xpu import OpTest, XPUOpTest +from op_test import skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +from scipy.special import logit +from scipy.special import expit + +paddle.enable_static() + + +class TestSigmoidCrossEntropyWithLogitsOp1(XPUOpTest): + """Test sigmoid_cross_entropy_with_logit_op with binary label + """ + + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + self.set_xpu() + self.init_dtype() + + batch_size = 64 + num_classes = 20 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype(self.dtype)), + 'Label': np.random.randint(0, 2, (batch_size, num_classes)) + .astype(self.dtype) + } + + # Fw Pass is implemented as elementwise sigmoid followed by + # elementwise logistic loss + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + self.outputs = {'Out': -term1 - term2} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def set_xpu(self): + self.__class__.use_xpu = True + self.place = paddle.XPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +class TestSigmoidCrossEntropyWithLogitsOp3( + TestSigmoidCrossEntropyWithLogitsOp1): + """Test sigmoid_cross_entropy_with_logit_op with probabalistic label + """ + + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + self.set_xpu() + self.init_dtype() + + batch_size = 64 + num_classes = 20 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype(self.dtype)), + 'Label': np.random.uniform(0, 1, (batch_size, num_classes)) + .astype(self.dtype) + } + + # Fw Pass is implemented as elementwise sigmoid followed by + # elementwise logistic loss + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + self.outputs = {'Out': -term1 - term2} + + +class TestSigmoidCrossEntropyWithLogitsOp5( + TestSigmoidCrossEntropyWithLogitsOp1): + """Test sigmoid_cross_entropy_with_logit_op with probabalistic label + """ + + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + self.set_xpu() + self.init_dtype() + + batch_size = [10, 10] + num_classes = 20 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, tuple(batch_size + [num_classes])) + .astype(self.dtype)), + 'Label': np.random.uniform(0, 1, tuple(batch_size + [num_classes])) + .astype(self.dtype) + } + + # Fw Pass is implemented as elementwise sigmoid followed by + # elementwise logistic loss + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + self.outputs = {'Out': -term1 - term2} + + +class TestSigmoidCrossEntropyWithLogitsOp6( + TestSigmoidCrossEntropyWithLogitsOp1): + """Test sigmoid_cross_entropy_with_logit_op with binary label + """ + + def 
setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + self.set_xpu() + self.init_dtype() + + batch_size = [10, 10] + num_classes = 20 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, tuple(batch_size + [num_classes])) + .astype(self.dtype)), + 'Label': np.random.randint(0, 2, tuple(batch_size + [num_classes])) + .astype(self.dtype) + } + + # Fw Pass is implemented as elementwise sigmoid followed by + # elementwise logistic loss + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + self.outputs = {'Out': -term1 - term2} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py new file mode 100644 index 0000000000000..20fd837ecee20 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest +from op_test_xpu import XPUOpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +# test with attr(num) +class TestSplitOp(XPUOpTest): + def initDefaultParameters(self): + self.dtype = 'float32' + self.x = np.random.random((4, 5, 6)).astype(self.dtype) + self.axis = 2 + self.sections = [] + self.num = 3 + self.indices_or_sections = 3 + + def setUp(self): + self.__class__.op_type = 'split' + self.use_xpu = True + self.use_mkldnn = False + self.initDefaultParameters() + self.inputs = {'X': self.x} + self.attrs = { + 'axis': self.axis, + 'sections': self.sections, + 'num': self.num + } + + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = {'Out': [('out%d' % i, out[i]) \ + for i in range(len(out))]} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +# unknown sections +class TestSplitOp_2(XPUOpTest): + def initDefaultParameters(self): + self.dtype = 'float32' + self.x = np.random.random((4, 5, 6)).astype(self.dtype) + self.axis = 2 + self.sections = [2, 1, -1] + self.num = 0 + self.indices_or_sections = [2, 3] + + def setUp(self): + self.__class__.op_type = 'split' + self.use_xpu = True + self.use_mkldnn = False + self.initDefaultParameters() + self.inputs = {'X': self.x} + self.attrs = { + 'axis': self.axis, + 'sections': self.sections, + 'num': self.num + } + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = {'Out': [('out%d' % i, out[i]) \ + for i in range(len(out))]} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() 
+ place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +# test with int32 +class TestSplitOp_5(XPUOpTest): + def initDefaultParameters(self): + self.dtype = 'int32' + self.x = np.random.random((4, 5, 6)).astype(self.dtype) + self.axis = 2 + self.sections = [] + self.num = 3 + self.indices_or_sections = 3 + + def setUp(self): + self.__class__.op_type = 'split' + self.use_xpu = True + self.use_mkldnn = False + self.initDefaultParameters() + self.inputs = {'X': self.x} + self.attrs = { + 'axis': self.axis, + 'sections': self.sections, + 'num': self.num + } + + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = {'Out': [('out%d' % i, out[i]) \ + for i in range(len(out))]} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py index 8ae588975a56a..8ab556efd4241 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py @@ -25,6 +25,7 @@ from paddle.fluid.op import Operator from paddle.fluid.tests.unittests.op_test import ( OpTest, convert_float_to_uint16, convert_uint16_to_float) +from paddle import _C_ops paddle.enable_static() @@ -171,11 +172,11 @@ class TestSumOpError(unittest.TestCase): def test_errors(self): def test_empty_list_input(): with fluid.dygraph.guard(): - fluid.core.ops.sum([]) + fluid._C_ops.sum([]) def test_list_of_none_input(): with fluid.dygraph.guard(): - fluid.core.ops.sum([None]) + fluid._C_ops.sum([None]) self.assertRaises(Exception, test_empty_list_input) self.assertRaises(Exception, test_list_of_none_input) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py new file mode 100644 index 0000000000000..a0f4b4244355b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py @@ -0,0 +1,289 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
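A standalone sketch, outside the patch, of how the split XPU cases above map their attrs onto the NumPy reference: 'num' divides an axis into equal parts, while 'sections' such as [2, 1, -1] correspond to the split indices [2, 3] passed to np.split.

import numpy as np

x = np.random.random((4, 5, 6)).astype("float32")
outs_equal = np.split(x, 3, axis=2)        # num=3 on a size-6 axis: three (4, 5, 2) parts
outs_ragged = np.split(x, [2, 3], axis=2)  # sections=[2, 1, -1]: widths 2, 1 and 3
print([o.shape for o in outs_ragged])      # [(4, 5, 2), (4, 5, 1), (4, 5, 3)]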
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid.core as core + +paddle.enable_static() + + +def numpy_topk(x, k=1, axis=-1, largest=True): + if axis < 0: + axis = len(x.shape) + axis + if largest: + indices = np.argsort(-x, axis=axis) + else: + indices = np.argsort(x, axis=axis) + if largest: + value = -np.sort(-x, axis=axis) + else: + value = np.sort(x, axis=axis) + indices = indices.take(indices=range(0, k), axis=axis) + value = value.take(indices=range(0, k), axis=axis) + return value, indices + + +class TestTopkOp(OpTest): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 20) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad(set(['X']), 'Out') + + +class TestTopkOp1(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp2(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp3(TestTopkOp): + def init_args(self): + self.k = 5 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp4(TestTopkOp): + def init_args(self): + self.k = 1 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp5(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 2 + 
self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp6(TestTopkOp): + def init_args(self): + self.k = 5 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(8, 32, 64) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp7(TestTopkOp): + def init_args(self): + self.k = 10 + self.axis = 2 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(8, 5, 10, 16) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp8(TestTopkOp): + def init_args(self): + self.k = 1 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(8, 32, 64) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp9(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp10(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp11(TestTopkOp): + def init_args(self): + self.k = 5 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp12(TestTopkOp): + 
def init_args(self): + self.k = 1 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float32 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index 5aa7f9c972f9b..f3763cb447f39 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -665,9 +665,16 @@ def _setitem_impl_(var, item, value): "paddle.Tensor to a paddle.Tensor, but received {}".format( type(value))) + if paddle.fluid.framework.in_dygraph_mode(): + var._bump_inplace_version() + cur_block = default_main_program().current_block() cur_block.append_op( - type="set_value", inputs=inputs, outputs={'Out': var}, attrs=attrs) + type="set_value", + inputs=inputs, + outputs={'Out': var}, + attrs=attrs, + inplace_map={"Input": "Out"}) return var diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 722003c034091..a0503322806e5 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -19,6 +19,7 @@ from .framework import get_default_dtype # noqa: F401 from .framework import set_default_dtype # noqa: F401 from .framework import set_grad_enabled # noqa: F401 +from .framework import is_grad_enabled # noqa: F401 from ..fluid.param_attr import ParamAttr # noqa: F401 from ..fluid.layers.tensor import create_parameter # noqa: F401 @@ -27,6 +28,7 @@ from ..fluid.core import CUDAPlace # noqa: F401 from ..fluid.core import CUDAPinnedPlace # noqa: F401 from ..fluid.core import NPUPlace # noqa: F401 +from ..fluid.core import MLUPlace # noqa: F401 from ..fluid.core import VarBase # noqa: F401 from paddle.fluid import core # noqa: F401 diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index e9d690c28d60e..e899d267289d5 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -116,3 +116,28 @@ def set_grad_enabled(mode): tracer._has_grad = prev_mode else: yield + + +def is_grad_enabled(): + """ + Returns whether current dygraph gradient calculation mode is enabled. + + Returns: + bool: True if current dygraph gradient calculation mode is enabled, otherwise false. + + Examples: + .. code-block:: python + + import paddle + + # Dygraph gradient calculation mode is enabled by default. 
+ paddle.is_grad_enabled() # True + + with paddle.set_grad_enabled(False): + paddle.is_grad_enabled() # False + + paddle.enable_static() + paddle.is_grad_enabled() # False + """ + tracer = _dygraph_tracer() + return tracer._has_grad if tracer else False diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py index 9b8f542658dd6..45810621e4207 100644 --- a/python/paddle/incubate/operators/graph_send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -16,6 +16,7 @@ from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid import core +from paddle import _C_ops def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): @@ -82,8 +83,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): % pool_type) if in_dygraph_mode(): - out, tmp = core.ops.graph_send_recv(x, src_index, dst_index, - 'pool_type', pool_type.upper()) + out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index, 'pool_type', + pool_type.upper()) return out check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), diff --git a/python/paddle/incubate/operators/softmax_mask_fuse.py b/python/paddle/incubate/operators/softmax_mask_fuse.py index 4c95a1ab51288..21f7558b537a2 100644 --- a/python/paddle/incubate/operators/softmax_mask_fuse.py +++ b/python/paddle/incubate/operators/softmax_mask_fuse.py @@ -17,6 +17,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import in_dygraph_mode from paddle.fluid import core +from paddle import _C_ops def softmax_mask_fuse(x, mask, name=None): @@ -58,7 +59,7 @@ def softmax_mask_fuse(x, mask, name=None): # [[[[0.02404429, 0.04658398, 0.02746007, ..., 0.01489375, 0.02397441, 0.02851614] ... ]]] """ if in_dygraph_mode(): - out = core.ops.fused_softmax_mask(x, mask) + out = _C_ops.fused_softmax_mask(x, mask) return out helper = LayerHelper('fused_softmax_mask', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) diff --git a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py index 918adf8c21a1c..be37330142401 100644 --- a/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py +++ b/python/paddle/incubate/operators/softmax_mask_fuse_upper_triangle.py @@ -17,6 +17,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import in_dygraph_mode from paddle.fluid import core +from paddle import _C_ops def softmax_mask_fuse_upper_triangle(x): @@ -58,7 +59,7 @@ def softmax_mask_fuse_upper_triangle(x): # ... 
]]] """ if in_dygraph_mode(): - out = core.ops.fused_softmax_mask_upper_triangle(x) + out = _C_ops.fused_softmax_mask_upper_triangle(x) return out helper = LayerHelper('fused_softmax_mask_upper_triangle', **locals()) diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 119db0894f917..d6b8d6363690a 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -15,26 +15,32 @@ from .tensor.linalg import cholesky # noqa: F401 from .tensor.linalg import norm # noqa: F401 from .tensor.linalg import eig # noqa: F401 +from .tensor.linalg import cov # noqa: F401 from .tensor.linalg import cond # noqa: F401 from .tensor.linalg import matrix_power # noqa: F401 from .tensor.linalg import solve # noqa: F401 +from .tensor.linalg import cholesky_solve # noqa: F401 from .tensor import inverse as inv # noqa: F401 from .tensor.linalg import eigvals # noqa: F401 from .tensor.linalg import multi_dot # noqa: F401 -from .tensor.linalg import matrix_rank -from .tensor.linalg import svd -from .tensor.linalg import eigvalsh -from .tensor.linalg import qr +from .tensor.linalg import matrix_rank # noqa: F401 +from .tensor.linalg import svd # noqa: F401 +from .tensor.linalg import eigvalsh # noqa: F401 +from .tensor.linalg import qr # noqa: F401 +from .tensor.linalg import lu # noqa: F401 +from .tensor.linalg import lu_unpack # noqa: F401 from .tensor.linalg import eigh # noqa: F401 -from .tensor.linalg import det -from .tensor.linalg import slogdet -from .tensor.linalg import pinv -from .tensor.linalg import triangular_solve +from .tensor.linalg import det # noqa: F401 +from .tensor.linalg import slogdet # noqa: F401 +from .tensor.linalg import pinv # noqa: F401 +from .tensor.linalg import triangular_solve # noqa: F401 +from .tensor.linalg import lstsq __all__ = [ 'cholesky', #noqa 'norm', 'cond', + 'cov', 'inv', 'eig', 'eigvals', @@ -42,6 +48,8 @@ 'matrix_rank', 'svd', 'qr', + 'lu', + 'lu_unpack', 'matrix_power', 'det', 'slogdet', @@ -49,5 +57,7 @@ 'eigvalsh', 'pinv', 'solve', + 'cholesky_solve', 'triangular_solve', + 'lstsq' ] diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index e1c40e8d0d3d7..ad8f28f40bb58 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -46,6 +46,7 @@ from .layer.activation import Softshrink # noqa: F401 from .layer.activation import Softsign # noqa: F401 from .layer.activation import Swish # noqa: F401 +from .layer.activation import Mish # noqa: F401 from .layer.activation import Tanhshrink # noqa: F401 from .layer.activation import ThresholdedReLU # noqa: F401 from .layer.activation import LogSoftmax # noqa: F401 @@ -68,6 +69,7 @@ from .layer.common import Dropout3D # noqa: F401 from .layer.common import AlphaDropout # noqa: F401 from .layer.common import Unfold # noqa: F401 +from .layer.common import Fold # noqa: F401 from .layer.pooling import AvgPool1D # noqa: F401 from .layer.pooling import AvgPool2D # noqa: F401 @@ -75,7 +77,9 @@ from .layer.pooling import MaxPool1D # noqa: F401 from .layer.pooling import MaxPool2D # noqa: F401 from .layer.pooling import MaxPool3D # noqa: F401 +from .layer.pooling import MaxUnPool1D # noqa: F401 from .layer.pooling import MaxUnPool2D # noqa: F401 +from .layer.pooling import MaxUnPool3D # noqa: F401 from .layer.pooling import AdaptiveAvgPool1D # noqa: F401 from .layer.pooling import AdaptiveAvgPool2D # noqa: F401 from .layer.pooling import AdaptiveAvgPool3D # noqa: F401 @@ -215,6 +219,7 @@ def weight_norm(*args): 'Bilinear', 'AlphaDropout', 'Unfold', + 'Fold', 'RNNCellBase', 
'SimpleRNNCell', 'LSTMCell', @@ -292,11 +297,14 @@ def weight_norm(*args): 'LogSoftmax', 'Sigmoid', 'Swish', + 'Mish', 'PixelShuffle', 'ELU', 'ReLU6', 'LayerDict', 'ZeroPad2D', + 'MaxUnPool1D', 'MaxUnPool2D', + 'MaxUnPool3D', 'HingeEmbeddingLoss', ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index a504c1ee6a4fe..a24afc45a5995 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -39,6 +39,7 @@ from .activation import softshrink # noqa: F401 from .activation import softsign # noqa: F401 from .activation import swish # noqa: F401 +from .activation import mish # noqa: F401 from .activation import tanh # noqa: F401 from .activation import tanh_ # noqa: F401 from .activation import tanhshrink # noqa: F401 @@ -55,6 +56,7 @@ from .common import zeropad2d # noqa: F401 from .common import cosine_similarity # noqa: F401 from .common import unfold # noqa: F401 +from .common import fold from .common import interpolate # noqa: F401 from .common import upsample # noqa: F401 from .common import bilinear # noqa: F401 @@ -105,7 +107,9 @@ from .pooling import adaptive_avg_pool1d # noqa: F401 from .pooling import adaptive_avg_pool2d # noqa: F401 from .pooling import adaptive_avg_pool3d # noqa: F401 +from .pooling import max_unpool1d # noqa: F401 from .pooling import max_unpool2d # noqa: F401 +from .pooling import max_unpool3d # noqa: F401 from .vision import affine_grid # noqa: F401 from .vision import grid_sample # noqa: F401 @@ -148,6 +152,7 @@ 'sigmoid', 'silu', 'swish', + 'mish', 'tanh', 'tanh_', 'tanhshrink', @@ -176,7 +181,9 @@ 'max_pool1d', 'max_pool2d', 'max_pool3d', + 'max_unpool1d', 'max_unpool2d', + 'max_unpool3d', 'adaptive_avg_pool1d', 'adaptive_avg_pool2d', 'adaptive_avg_pool3d', @@ -216,4 +223,5 @@ 'instance_norm', 'class_center_sample', 'sparse_attention', + 'fold', ] diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 4a071c2fe74f1..ac08ac9391eb3 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -1174,6 +1174,46 @@ def swish(x, name=None): return out +def mish(x, name=None): + r""" + mish activation. + + .. math:: + + softplus(x) = \begin{cases} + x, \text{if } x > \text{threshold} \\ + \ln(1 + e^{x}), \text{otherwise} + \end{cases} + + mish(x) = x * \tanh(softplus(x)) + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-5., 0., 5.]) + out = F.mish(x) # [-0.03357624, 0., 4.99955208] + """ + if in_dygraph_mode(): + return _C_ops.mish(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mish') + helper = LayerHelper('mish', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op(type='mish', inputs={'X': x}, outputs={'Out': out}) + return out + + def tanhshrink(x, name=None): """ tanhshrink activation diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 98019ceb480a0..3dba9505e92c7 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1763,7 +1763,7 @@ class centers and the shape of sampled_class_center will be [num_positive_class_ seed = default_main_program().random_seed if in_dygraph_mode(): - remapped_label, sampled_class_center = core.ops.class_center_sample( + remapped_label, sampled_class_center = _C_ops.class_center_sample( label, 'num_classes', num_classes, 'num_samples', num_samples, 'ring_id', ring_id, 'nranks', nranks, 'rank', rank, 'fix_seed', seed is not None, 'seed', seed if seed is not None else 0) @@ -1794,3 +1794,130 @@ class centers and the shape of sampled_class_center will be [num_positive_class_ 'seed': seed if seed is not None else 0 }) return remapped_label, sampled_class_center + + +def fold(x, + output_sizes, + kernel_sizes, + strides=1, + paddings=0, + dilations=1, + name=None): + r""" + + This Op is used to combines an array of sliding local blocks into a large containing + tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each + combined value in the resulting large tensor by summing all values from all containing blocks. + + + For each input :math:`x` with shape [N, C_in , L], the output shape [N, C_out, H_out, W_out] + can be calculated as following. + + .. math:: + + H_out &= output_size[0] + W_out &= output_size[1] + C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1] + + Parameters: + x(Tensor): 3-D Tensor, input tensor of format [N, C, L], + data type can be float32 or float64 + output_sizes(list): The size of output size, should be [output_size_h, output_size_w] + or an interger o treated as [o, o]. + kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + or an integer k treated as [k, k]. + strides(int|list): The strides, should be [stride_h, stride_w] + or an integer stride treated as [sride, stride]. + For default, strides will be [1, 1]. + paddings(int|list): The paddings of each dimension, should be + [padding_top, padding_left, padding_bottom, padding_right] + or [padding_h, padding_w] or an integer padding. + If [padding_h, padding_w] was given, it will expanded to + [padding_h, padding_w, padding_h, padding_w]. If an integer + padding was given, [padding, padding, padding, padding] will + be used. For default, paddings will be [0, 0, 0, 0] + dilations(int|list): the dilations of convolution kernel, should be + [dilation_h, dilation_w], or an integer dilation treated as + [dilation, dilation]. For default, it will be [1, 1]. + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + + Returns: + The tensor formed by combining a group of sliding local blocks + The output shape is [N, Cout, H, W] as decriabled above. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.randn([2,12,9]) + y = F.fold(x, output_sizes=(4, 4), kernel_sizes=2) + # y.shape = [2,3,4,4] + + """ + + helper = LayerHelper("fold", **locals()) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'fold') + + assert len(x.shape) == 3, \ + "input should be the format of [N, C, L]" + + if isinstance(output_sizes, int): + output_sizes = [output_sizes, output_sizes] + else: + assert isinstance(output_sizes, list) and (len(output_sizes) == 2), \ + "output_sizes should either be an integer or a list of two integers" + + if isinstance(kernel_sizes, int): + kernel_sizes = [kernel_sizes, kernel_sizes] + else: + assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ + "kernel_sizes should either be an integer or a list of two integers" + + if isinstance(strides, int): + strides = [strides, strides] + else: + assert isinstance(strides, list) and (len(strides) == 2), \ + "strides should either be an integer or a list of two integers" + + if isinstance(dilations, int): + dilations = [dilations, dilations] + else: + assert isinstance(dilations, list) and (len(dilations) == 2), \ + "dilations should either be an integer or a list of two integers" + + if isinstance(paddings, int): + paddings = [paddings] * 4 + elif isinstance(paddings, list): + if len(paddings) == 2: + paddings = paddings * 2 + elif len(paddings) == 4: + pass + else: + raise ValueError( + "paddings should either be an integer or a list of 2 or 4 integers" + ) + else: + raise ValueError( + "Unexpected type of paddings, it should be either an integer or a list" + "of 2 or 4 integers") + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="fold", + inputs={"X": x}, + outputs={"Y": out}, + attrs={ + "output_sizes": output_sizes, + "kernel_sizes": kernel_sizes, + "strides": strides, + "paddings": paddings, + "dilations": dilations + }) + return out diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 328eb07b5e960..554651ea1332c 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1320,7 +1320,7 @@ def margin_cross_entropy(logits, label = paddle.unsqueeze(label, axis=-1) if in_dygraph_mode(): - softmax, loss = core.ops.margin_cross_entropy( + softmax, loss = _C_ops.margin_cross_entropy( logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks, 'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale', scale, 'return_softmax', return_softmax) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 27f4d4a7db345..db9665f7a32c4 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -664,6 +664,115 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size): return ret +def max_unpool1d(x, + indices, + kernel_size, + stride=None, + padding=0, + data_format="NCL", + output_size=None, + name=None): + """ + This API implements max unpooling 1d opereation. + `max_unpool1d` accepts the output of `max_pool1d` as input, + including the indices of the maximum value and calculate the partial inverse. + All non-maximum values ​​are set to zero. + + - Input: :math:`(N, C, L_{in})` + - Output: :math:`(N, C, L_{out})`, where + + .. math:: + L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size + + or as given by :attr:`output_size` in the call operator. 
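For example, with :math:`L_{in} = 8`, ``kernel_size = 2``, ``stride = 2`` and ``padding = 0``, :math:`L_{out} = (8 - 1) * 2 - 2 * 0 + 2 = 16`, which is the length restored in the code example below.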
+ + + Args: + x (Tensor): The input tensor of unpooling operator which is a 3-D tensor with + shape [N, C, L]. The format of input tensor is `"NCL"`, + where `N` is batch size, `C` is the number of channels, `L` is + the length of the feature. The data type is float32 or float64. + indices (Tensor): The indices given out by maxpooling1d which is a 3-D tensor with + shape [N, C, L]. The format of input tensor is `"NCL"` , + where `N` is batch size, `C` is the number of channels, `L` is + the length of the featuree. The data type is float32 or float64. + kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, + it must contain an integer. + stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, + it must contain an integer. + padding (int | tuple): Padding that was added to the input. + output_size(list|tuple, optional): The target output size. If output_size is not specified, + the actual output shape will be automatically calculated by (input_shape, + kernel_size, stride, padding). + data_format (string): The data format of the input and output data. + The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of: + `[batch_size, input_channels, input_length]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of unpooling result. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + data = paddle.rand(shape=[1, 3, 16]) + pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + # pool_out shape: [1, 3, 8], indices shape: [1, 3, 8] + unpool_out = F.max_unpool1d(pool_out, indices, kernel_size=2, padding=0) + # unpool_out shape: [1, 3, 16] + + """ + """NCL to NCHW""" + if data_format not in ["NCL"]: + raise ValueError("Attr(data_format) should be 'NCL'. Received " + "Attr(data_format): %s." % str(data_format)) + data_format = "NCHW" + x = unsqueeze(x, [2]) + indices = unsqueeze(indices, [2]) + kernel_size = [1] + utils.convert_to_list(kernel_size, 1, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride') + padding, padding_algorithm = _update_padding_nd(padding, 1) + # use 2d to implenment 1d should expand padding in advance. 
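    # (Editor's note, an assumption about the helper rather than its documented
    # contract: the 1-D padding such as [p] is widened with a zero entry for the
    # dummy height axis added above, roughly [p] -> [0, p], so the 2-D unpool
    # kernel applies no padding along that fake dimension.)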
+ padding = _expand_low_nd_padding(padding) + + output_size = _unpool_output_size(x, kernel_size, stride, padding, + output_size) + + if in_dygraph_mode(): + output = _C_ops.unpool(x, indices, 'unpooling_type', 'max', 'ksize', + kernel_size, 'strides', stride, 'paddings', + padding, "output_size", output_size, + "data_format", data_format) + return squeeze(output, [2]) + + op_type = "unpool" + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name="x") + unpool_out = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type=op_type, + inputs={"X": x, + "Indices": indices}, + outputs={"Out": unpool_out}, + attrs={ + "unpooling_type": "max", + "ksize": kernel_size, + "strides": stride, + "paddings": padding, + "output_size": output_size + }) + return squeeze(unpool_out, [2]) + + def max_unpool2d(x, indices, kernel_size, @@ -779,6 +888,118 @@ def max_unpool2d(x, return unpool_out +def max_unpool3d(x, + indices, + kernel_size, + stride=None, + padding=0, + data_format="NCDHW", + output_size=None, + name=None): + """ + This API implements max unpooling 3d opereation. + `max_unpool3d` accepts the output of `max_pool3d` as input, + including the indices of the maximum value and calculate the partial inverse. + All non-maximum values ​​are set to zero. + + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + + .. math:: + H_{out} = (H_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + + .. math:: + W_{out} = (W_{in} - 1) * stride[2] - 2 * padding[2] + kernel\_size[2] + + or as given by :attr:`output_size` in the call operator + + + Args: + x (Tensor): The input tensor of unpooling operator which is a 5-D tensor with + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"`, + where `N` is batch size, `C` is the number of channels, `D` is + the depth of the feature, `H` is the height of the feature, + and `W` is the width of the feature. The data type is float32 or float64. + indices (Tensor): The indices given out by maxpooling3d which is a 5-D tensor with + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` , + where `N` is batch size, `C` is the number of channels, `D` is + the depth of the feature, `H` is the height of the feature, + and `W` is the width of the feature. The data type is float32 or float64. + kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, + it must contain an integer. + stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, + it must contain an integer. + padding (int | tuple): Padding that was added to the input. + output_size(list|tuple, optional): The target output size. If output_size is not specified, + the actual output shape will be automatically calculated by (input_shape, + kernel_size, stride, padding). + data_format (string): The data format of the input and output data. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: The output tensor of unpooling result. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + data = paddle.rand(shape=[1, 1, 4, 4, 6]) + pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + # pool_out shape: [1, 1, 2, 2, 3], indices shape: [1, 1, 2, 2, 3] + unpool_out = F.max_unpool3d(pool_out, indices, kernel_size=2, padding=0) + # unpool_out shape: [1, 1, 4, 4, 6] + + """ + kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') + if stride is None: + stride = kernel_size + else: + stride = utils.convert_to_list(stride, 3, 'pool_stride') + padding = utils.convert_to_list(padding, 3, 'padding') + + if data_format not in ["NCDHW"]: + raise ValueError("Attr(data_format) should be 'NCDHW'. Received " + "Attr(data_format): %s." % str(data_format)) + + output_size = _unpool_output_size(x, kernel_size, stride, padding, + output_size) + + if in_dygraph_mode(): + output = _C_ops.unpool3d(x, indices, 'unpooling_type', 'max', 'ksize', + kernel_size, 'strides', stride, 'paddings', + padding, "output_size", output_size, + "data_format", data_format) + return output + + op_type = "unpool3d" + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name="x") + unpool_out = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type=op_type, + inputs={"X": x, + "Indices": indices}, + outputs={"Out": unpool_out}, + attrs={ + "unpooling_type": "max", + "ksize": kernel_size, + "strides": stride, + "paddings": padding, + "output_size": output_size + }) + return unpool_out + + def max_pool2d(x, kernel_size, stride=None, diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py index 55765782e5a7d..26aa349b5b1b4 100644 --- a/python/paddle/nn/initializer/dirac.py +++ b/python/paddle/nn/initializer/dirac.py @@ -27,14 +27,17 @@ class Dirac(Initializer): as many channels are reserved as possible. In this initialize method, elements in the middle of convolution kernels will - be set to 1 . The formula can be described as: + be set to 1 . The formula can be described as follow. - $ Assuming: N=min(in\_channels, out\_channels)$ + .. math:: - $ X[d, d, shape[2]//2, shape[3]//2, ...]=1, \ d=0,1...N$ + X[d, d, shape[2]//2, shape[3]//2, ...]=1, \ d=0,1...N + + where, ``N`` is the minimum value of ``in_channels`` and ``out_channels`` Args: - groups(int): 0-dimension of the Tensor will be divided by groups, each group has the same value. + groups(int, optional): 0-dimension of the Tensor will be divided by groups, + each group has the same value. Default: 1. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -46,7 +49,7 @@ class Dirac(Initializer): import paddle - #1.For kernel_size is uneven number: + #1. For kernel_size is uneven number: attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Dirac()) conv = paddle.nn.Conv1D(3, 2, 3, weight_attr=attr) diff --git a/python/paddle/nn/initializer/orthogonal.py b/python/paddle/nn/initializer/orthogonal.py index 8a3b9bf00278a..8e0acb9ab2d20 100644 --- a/python/paddle/nn/initializer/orthogonal.py +++ b/python/paddle/nn/initializer/orthogonal.py @@ -24,18 +24,24 @@ class Orthogonal(Initializer): """The orthogonal initializer. The initialized tensor is (semi) orthogonal. - Assuming that 'weight' will be initialized, its shape is [M, N]. + It's only applied to Tensor whose dimension is greater than or equal to 2. 
+ + For the Tensor whose dimension is greater than 2, the 0 dimension is seen as ``rows`` , + and the >=1 dimension are flattened as ``cols`` . + + Which can be describe as: .. code-block:: text - if M < N: + rows = shape[0] + cols = shape[1]·shape[2]···shape[N] + + if rows < cols: The rows are orthogonal vectors - elif M > N: + elif rows > cols: The columns are orthogonal vectors - else M = N: + else rows = cols: Both rows and columns are orthogonal vectors - - Only Tensor with 2 or more dimensions can initialized by Orthogonal. Args: gain(float, optional): The multiplication coefficient for initialized tensor. Default: 1.0. diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index a78269a4cd4d7..2b50508065605 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -44,6 +44,7 @@ from .common import Upsample # noqa: F401 from .common import UpsamplingBilinear2D # noqa: F401 from .common import UpsamplingNearest2D # noqa: F401 +from .common import Fold from .pooling import AvgPool1D # noqa: F401 from .pooling import AvgPool2D # noqa: F401 from .pooling import AvgPool3D # noqa: F401 @@ -56,7 +57,9 @@ from .pooling import AdaptiveMaxPool1D # noqa: F401 from .pooling import AdaptiveMaxPool2D # noqa: F401 from .pooling import AdaptiveMaxPool3D # noqa: F401 +from .pooling import MaxUnPool1D # noqa: F401 from .pooling import MaxUnPool2D # noqa: F401 +from .pooling import MaxUnPool3D # noqa: F401 from .conv import Conv1D # noqa: F401 from .conv import Conv2D # noqa: F401 from .conv import Conv3D # noqa: F401 diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 45308f15f4a3b..617981cb8f74c 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -881,6 +881,51 @@ def extra_repr(self): return name_str +class Mish(Layer): + r""" + Mish Activation. + + .. math:: + + softplus(x) = \begin{cases} + x, \text{if } x > \text{threshold} \\ + \ln(1 + e^{x}), \text{otherwise} + \end{cases} + + Mish(x) = x * \tanh(softplus(x)) + + Parameters: + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([-5., 0., 5.]) + m = paddle.nn.Mish() + out = m(x) # [-0.03357624, 0., 4.99955208] + + """ + + def __init__(self, name=None): + super(Mish, self).__init__() + self._name = name + + def forward(self, x): + return F.mish(x, self._name) + + def extra_repr(self): + name_str = 'name={}'.format(self._name) if self._name else '' + return name_str + + + class Tanhshrink(Layer): """ Tanhshrink Activation diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 1069a24be21f8..22f7f798374d8 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -1521,7 +1521,7 @@ class Unfold(Layer): unfold = nn.Unfold(kernel_sizes=[3, 3]) result = unfold(x) print(result) - """ + """ def __init__(self, kernel_sizes, @@ -1550,3 +1550,92 @@ def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.\ format(self.kernel_sizes, self.dilations, self.paddings, self.strides, name_str) + + +class Fold(Layer): + """ + + This op combines an array of sliding local blocks into a large containing + tensor, also known as col2im when operated on a batched 2D image tensor. Fold calculates each + combined value in the resulting large tensor by summing all values from all containing blocks. + + + For each input :math:`x` with shape [N, C_in, L], the output shape [N, C_out, H_out, W_out] + can be calculated as follows. + + .. math:: + + H_out &= output_size[0] + W_out &= output_size[1] + C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1] + + Parameters: + output_sizes(list): The size of the output, should be [output_size_h, output_size_w] + or an integer o treated as [o, o]. + kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + or an integer k treated as [k, k]. + strides(int|list): The strides, should be [stride_h, stride_w] + or an integer stride treated as [stride, stride]. + By default, strides will be [1, 1]. + paddings(int|list): The paddings of each dimension, should be + [padding_top, padding_left, padding_bottom, padding_right] + or [padding_h, padding_w] or an integer padding. + If [padding_h, padding_w] was given, it will be expanded to + [padding_h, padding_w, padding_h, padding_w]. If an integer + padding was given, [padding, padding, padding, padding] will + be used. By default, paddings will be [0, 0, 0, 0]. + dilations(int|list): the dilations of convolution kernel, should be + [dilation_h, dilation_w], or an integer dilation treated as + [dilation, dilation]. By default, it will be [1, 1]. + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + + Returns: + The tensor formed by combining a group of sliding local blocks. + The output shape is [N, C_out, H_out, W_out] as described above. + + Examples: + + ..
code-block:: python + + import paddle + import paddle.nn as nn + + x = paddle.randn([2,12,9]) + fold = nn.Fold(output_sizes=(4, 4), kernel_sizes=2) + y = fold(x) + # y.shape = [2,3,4,4] + """ + + def __init__(self, + output_sizes, + kernel_sizes, + dilations=1, + paddings=0, + strides=1, + name=None): + super(Fold, self).__init__() + + self.output_sizes = output_sizes + self.kernel_sizes = kernel_sizes + self.dilations = dilations + self.paddings = paddings + self.strides = strides + self.name = name + + def forward(self, input): + return F.fold( + input, + output_sizes=self.output_sizes, + kernel_sizes=self.kernel_sizes, + strides=self.strides, + paddings=self.paddings, + dilations=self.dilations, + name=self.name) + + def extra_repr(self): + name_str = ', name={}'.format(self.name) if self.name else '' + return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.\ + format(self.kernel_sizes, self.dilations, self.paddings, self.strides, name_str) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index b0e0fe323437d..de9f8663e6769 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -75,6 +75,7 @@ def __init__(self, self._epsilon = epsilon self._weight_attr = weight_attr self._bias_attr = bias_attr + self._num_features = num_features if weight_attr != False and bias_attr != False: self.scale = self.create_parameter( @@ -101,7 +102,7 @@ def forward(self, input): input, weight=self.scale, bias=self.bias, eps=self._epsilon) def extra_repr(self): - return 'num_features={}, epsilon={}'.format(self.scale.shape[0], + return 'num_features={}, epsilon={}'.format(self._num_features, self._epsilon) diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index cc49db9b2056f..96942f5c8500a 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -1130,6 +1130,88 @@ def extra_repr(self): self._return_mask) + +class MaxUnPool1D(Layer): + """ + This API implements the max unpooling 1d operation. + + `max_unpool1d` accepts the output of `max_pool1d` as input, + including the indices of the maximum values, and calculates the partial inverse. + All non-maximum values are set to zero. + + - Input: :math:`(N, C, L_{in})` + - Output: :math:`(N, C, L_{out})`, where + + .. math:: + L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size + + or as given by :attr:`output_size` in the call operator. + + Parameters: + kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, + it must contain one integer. + stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, + it must contain one integer. + padding (int | tuple): Padding that was added to the input. + output_size(list|tuple, optional): The target output size. If output_size is not specified, + the actual output shape will be automatically calculated by (input_shape, + kernel_size, stride, padding). + data_format (string): The data format of the input and output data. + The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of: + `[batch_size, input_channels, input_length]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name does not need to be set and + is None by default. + + + Returns: + A callable object of MaxUnPool1D. + + Examples: + ..
code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + data = paddle.rand(shape=[1, 3, 16]) + pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + # pool_out shape: [1, 3, 8], indices shape: [1, 3, 8] + Unpool1D = paddle.nn.MaxUnPool1D(kernel_size=2, padding=0) + unpool_out = Unpool1D(pool_out, indices) + # unpool_out shape: [1, 3, 16] + + """ + + def __init__(self, + kernel_size, + stride=None, + padding=0, + data_format="NCL", + output_size=None, + name=None): + super(MaxUnPool1D, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.data_format = data_format + self.output_size = output_size + self.name = name + + def forward(self, x, indices): + return F.max_unpool1d( + x, + indices, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + data_format=self.data_format, + output_size=self.output_size, + name=self.name) + + def extra_repr(self): + return 'output_size={}'.format(self.output_size) + + class MaxUnPool2D(Layer): """ This API implements max unpooling 2d opereation. @@ -1214,3 +1296,92 @@ def forward(self, x, indices): def extra_repr(self): return 'output_size={}'.format(self.output_size) + + +class MaxUnPool3D(Layer): + """ + This API implements the max unpooling 3d operation. + + `max_unpool3d` accepts the output of `max_pool3d` as input, + including the indices of the maximum values, and calculates the partial inverse. + All non-maximum values are set to zero. + + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where + + .. math:: + D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + + .. math:: + H_{out} = (H_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + + .. math:: + W_{out} = (W_{in} - 1) * stride[2] - 2 * padding[2] + kernel\_size[2] + + or as given by :attr:`output_size` in the call operator. + + + Parameters: + kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, + it must contain one integer. + stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, + it must contain one integer. + padding (int | tuple): Padding that was added to the input. + output_size(list|tuple, optional): The target output size. If output_size is not specified, + the actual output shape will be automatically calculated by (input_shape, + kernel_size, stride, padding). + data_format (string): The data format of the input and output data. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name does not need to be set and + is None by default. + + + Returns: + A callable object of MaxUnPool3D. + + Examples: + ..
code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + data = paddle.rand(shape=[1, 1, 4, 4, 6]) + pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) + # pool_out shape: [1, 1, 2, 2, 3], indices shape: [1, 1, 2, 2, 3] + Unpool3D = paddle.nn.MaxUnPool3D(kernel_size=2, padding=0) + unpool_out = Unpool3D(pool_out, indices) + # unpool_out shape: [1, 1, 4, 4, 6] + + """ + + def __init__(self, + kernel_size, + stride=None, + padding=0, + data_format="NCDHW", + output_size=None, + name=None): + super(MaxUnPool3D, self).__init__() + self.ksize = kernel_size + self.stride = stride + self.padding = padding + self.data_format = data_format + self.output_size = output_size + self.name = name + + def forward(self, x, indices): + return F.max_unpool3d( + x, + indices, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + data_format=self.data_format, + output_size=self.output_size, + name=self.name) + + def extra_repr(self): + return 'output_size={}'.format(self.output_size) diff --git a/python/paddle/nn/utils/__init__.py b/python/paddle/nn/utils/__init__.py index b6801cfe3208d..8f9b55d15cad0 100644 --- a/python/paddle/nn/utils/__init__.py +++ b/python/paddle/nn/utils/__init__.py @@ -14,7 +14,8 @@ from .spectral_norm_hook import spectral_norm from .weight_norm_hook import weight_norm, remove_weight_norm # noqa: F401 +from .transform_parameters import parameters_to_vector, vector_to_parameters # noqa: F401 __all__ = [ #noqa - 'weight_norm', 'remove_weight_norm', 'spectral_norm' + 'weight_norm', 'remove_weight_norm', 'spectral_norm', 'parameters_to_vector', 'vector_to_parameters' ] diff --git a/python/paddle/nn/utils/transform_parameters.py b/python/paddle/nn/utils/transform_parameters.py new file mode 100644 index 0000000000000..03d2fa514869d --- /dev/null +++ b/python/paddle/nn/utils/transform_parameters.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import reduce + +import paddle +from paddle.fluid.framework import dygraph_only, _dygraph_tracer, _varbase_creator +from paddle import _C_ops + + +#input==output, inplace strategy of reshape has no cost almostly +def _inplace_reshape_dygraph(x, shape): + x_shape = _varbase_creator(dtype=x.dtype) + _dygraph_tracer().trace_op( + type="reshape2", + inputs={'X': x}, + outputs={'Out': x, + 'XShape': x_shape}, + attrs={'shape': shape}, + stop_gradient=True) + + +@dygraph_only +def parameters_to_vector(parameters, name=None): + """ + Flatten parameters to a 1-D Tensor. + + Args: + parameters(Iterable[Tensor]): Iterable Tensors that are trainable parameters of a Layer. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A 1-D Tensor, which represents the parameters of a Layer. + + + Examples: + .. 
code-block:: python + + import paddle + linear = paddle.nn.Linear(10, 15) + + paddle.nn.utils.parameters_to_vector(linear.parameters()) + # 1-D Tensor: [165] + + """ + dtype = parameters[0].dtype + origin_shapes = [] + for param in parameters: + origin_shapes.append(param.shape) + _inplace_reshape_dygraph(param, [-1]) + + out = _varbase_creator(dtype=dtype) + _dygraph_tracer().trace_op( + type='concat', + inputs={'X': parameters}, + outputs={'Out': [out]}, + attrs={'axis': 0}, + stop_gradient=True) + for i, param in enumerate(parameters): + _inplace_reshape_dygraph(param, origin_shapes[i]) + return out + + +@dygraph_only +def vector_to_parameters(vec, parameters, name=None): + """ + Transform a 1-D Tensor to the input ``parameters`` . + + Args: + vec (Tensor): A 1-D Tensor, which will be sliced and copied to the input ``parameters`` . + parameters (Iterable[Tensor]): Iterable Tensors that are trainable parameters of a Layer. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Examples: + .. code-block:: python + + import paddle + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(3.)) + linear1 = paddle.nn.Linear(10, 15, weight_attr) + + vec = paddle.nn.utils.parameters_to_vector(linear1.parameters()) + + linear2 = paddle.nn.Linear(10, 15) + # copy weight of linear1 to linear2 + paddle.nn.utils.vector_to_parameters(vec, linear2.parameters()) + # weight: Tensor(shape=[10, 15], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[3. , ..., 3. ], + # [..., ..., ...], + # [3. , ..., 3. ]]) + """ + origin_shapes = [] + sections = [] + for param in parameters: + shape = param.shape + origin_shapes.append(shape) + numel = reduce(lambda x, y: x * y, shape) + sections.append(numel) + + _dygraph_tracer().trace_op( + type='split', + inputs={'X': [vec]}, + outputs={'Out': parameters}, + attrs={'axis': 0, + 'sections': sections}, + stop_gradient=True) + + for i, param in enumerate(parameters): + _inplace_reshape_dygraph(param, origin_shapes[i]) + return diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index cc28eead522d4..8134c9f71b669 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -92,6 +92,7 @@ class Adam(Optimizer): different semantics with the original Adam algorithm and may lead to different result. The default value is False. multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false. + use_multi_tensor (bool, optional): Whether to use multi-tensor strategy to update all parameters at once . Default is false. name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. The default value is None. 
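As documented above, use_multi_tensor is opt-in; a minimal dygraph usage sketch (illustrative only, with hypothetical layer sizes and learning rate) could look like this:

import paddle

linear = paddle.nn.Linear(10, 10)
inp = paddle.rand([4, 10])
loss = paddle.mean(linear(inp))

# With use_multi_tensor=True the optimizer groups parameters by dtype
# (FP32/FP16) and updates each group through a single merged_adam call,
# as added in this patch.
adam = paddle.optimizer.Adam(learning_rate=0.001,
                             parameters=linear.parameters(),
                             use_multi_tensor=True)
loss.backward()
adam.step()
adam.clear_grad()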
@@ -172,6 +173,7 @@ def __init__(self, grad_clip=None, lazy_mode=False, multi_precision=False, + use_multi_tensor=False, name=None): assert learning_rate is not None assert beta1 is not None @@ -209,6 +211,24 @@ def __init__(self, 'lazy_mode': lazy_mode, } + self._use_multi_tensor = use_multi_tensor + if self._use_multi_tensor: + self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + self._moment1_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + self._moment2_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + self._beta1_pow_acc_dict = { + 'FP32_LODTensor': [], + 'FP16_LODTensor': [] + } + self._beta2_pow_acc_dict = { + 'FP32_LODTensor': [], + 'FP16_LODTensor': [] + } + self._master_weight_dict = { + 'FP32_LODTensor': None, + 'FP16_LODTensor': [] + } + def _create_master_weight(self, param): if param.name in self._master_weights: var = self._master_weights[param.name] @@ -436,6 +456,157 @@ def step(self): self._apply_optimize( loss=None, startup_program=None, params_grads=params_grads) + def _multi_tensor_init(self, target_block, parameters): + """ + All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). + This function will be overridden in the corresponding optimizer file. + Args: + target_block: the block in which the loss tensor is present + parameters: list of parameter tensors for the optimizer + """ + self._create_accumulators(target_block, parameters) + for param in parameters: + moment1 = self._get_accumulator(self._moment1_acc_str, param) + moment2 = self._get_accumulator(self._moment2_acc_str, param) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param) + + if param.dtype == paddle.float32: + self._param_dict['FP32_LODTensor'].append(param) + self._moment1_dict['FP32_LODTensor'].append(moment1) + self._moment2_dict['FP32_LODTensor'].append(moment2) + self._beta1_pow_acc_dict['FP32_LODTensor'].append(beta1_pow_acc) + self._beta2_pow_acc_dict['FP32_LODTensor'].append(beta2_pow_acc) + elif param.dtype == paddle.float16: + self._param_dict['FP16_LODTensor'].append(param) + self._moment1_dict['FP16_LODTensor'].append(moment1) + self._moment2_dict['FP16_LODTensor'].append(moment2) + self._beta1_pow_acc_dict['FP16_LODTensor'].append(beta1_pow_acc) + self._beta2_pow_acc_dict['FP16_LODTensor'].append(beta2_pow_acc) + if self._multi_precision: + self._master_weight_dict['FP16_LODTensor'].append( + self._master_weights[param.name]) + else: + self._master_weight_dict['FP16_LODTensor'] = None + else: + raise ValueError( + "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR." + ) + + def _append_optimize_multi_tensor_op(self, target_block, + parameters_and_grads): + """ + For Multi Tensor, append optimize merged_operator to block. 
+ """ + assert isinstance(target_block, framework.Block) + + grad_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + lr_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + + if isinstance(parameters_and_grads, list): + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + if param_and_grad[0].stop_gradient is False: + if param_and_grad[ + 0].dtype == paddle.float32 and param_and_grad[ + 1].type == core.VarDesc.VarType.LOD_TENSOR: + grad_dict['FP32_LODTensor'].append(param_and_grad[1]) + lr = self._create_param_lr(param_and_grad) + lr_dict['FP32_LODTensor'].append(lr) + elif param_and_grad[ + 0].dtype == paddle.float16 and param_and_grad[ + 1].type == core.VarDesc.VarType.LOD_TENSOR: + grad_dict['FP16_LODTensor'].append(param_and_grad[1]) + lr = self._create_param_lr(param_and_grad) + lr_dict['FP16_LODTensor'].append(lr) + else: + for param_and_grad in parameters_and_grads['params']: + if param_and_grad[1] is None: + continue + if param_and_grad[0].stop_gradient is False: + param_grad_dict = dict() + param_grad_dict['params'] = param_and_grad + param_grad_dict.update({ + k: v + for k, v in parameters_and_grads.items() + if k != 'params' + }) + param_and_grad = self._update_param_group(param_grad_dict) + if param_and_grad[ + 0].dtype == paddle.float32 and param_and_grad[ + 1].type == core.VarDesc.VarType.LOD_TENSOR: + grad_dict['FP32_LODTensor'].append(param_and_grad[1]) + lr = self._create_param_lr(param_and_grad) + lr_dict['FP32_LODTensor'].append(lr) + elif param_and_grad[ + 0].dtype == paddle.float16 and param_and_grad[ + 1].type == core.VarDesc.VarType.LOD_TENSOR: + grad_dict['FP16_LODTensor'].append(param_and_grad[1]) + lr = self._create_param_lr(param_and_grad) + lr_dict['FP16_LODTensor'].append(lr) + + multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor'] + for key in multi_tensor_list: + if len(self._param_dict[key]) > 0: + if key == 'FP32_LODTensor': + self._multi_precision = False + + _beta1 = self._beta1 if not isinstance( + self._beta1, Variable) else self._beta1.numpy().item(0) + _beta2 = self._beta2 if not isinstance( + self._beta2, Variable) else self._beta2.numpy().item(0) + + if framework.in_dygraph_mode(): + _, _, _, _, _, _ = _C_ops.merged_adam( + self._param_dict[key], grad_dict[key], lr_dict[key], + self._moment1_dict[key], self._moment2_dict[key], + self._beta1_pow_acc_dict[key], + self._beta2_pow_acc_dict[key], + self._master_weight_dict[key], self._param_dict[key], + self._moment1_dict[key], self._moment2_dict[key], + self._beta1_pow_acc_dict[key], + self._beta2_pow_acc_dict[key], + self._master_weight_dict[key], 'epsilon', self._epsilon, + 'beta1', _beta1, 'beta2', _beta2, 'multi_precision', + self._multi_precision) + else: + inputs = { + "Param": self._param_dict[key], + "Grad": grad_dict[key], + "LearningRate": lr_dict[key], + "Moment1": self._moment1_dict[key], + "Moment2": self._moment2_dict[key], + "Beta1Pow": self._beta1_pow_acc_dict[key], + "Beta2Pow": self._beta2_pow_acc_dict[key] + } + outputs = { + "ParamOut": self._param_dict[key], + "Moment1Out": self._moment1_dict[key], + "Moment2Out": self._moment2_dict[key], + "Beta1PowOut": self._beta1_pow_acc_dict[key], + "Beta2PowOut": self._beta2_pow_acc_dict[key] + } + attrs = { + "epsilon": self._epsilon, + "beta1": _beta1, + "beta2": _beta2 + } + if self._multi_precision: + inputs["MasterParam"] = self._master_weight_dict[key] + outputs["MasterParamOut"] = self._master_weight_dict[ + key] + attrs["multi_precision"] = self._multi_precision + target_block.append_op( + 
type="merged_adam", + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + return None + def _update_param_group(self, parameters): self._beta1 = parameters.get('beta1', self._default_dict['beta1']) self._beta2 = parameters.get('beta2', self._default_dict['beta2']) diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index be1786696bd92..d4fafba9229b0 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -17,7 +17,7 @@ import warnings from paddle import Tensor -__all__ = [ #noqa +__all__ = [ # noqa 'LRScheduler', 'NoamDecay', 'PiecewiseDecay', @@ -30,7 +30,8 @@ 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau', - 'CosineAnnealingDecay' + 'CosineAnnealingDecay', + 'MultiplicativeDecay' ] @@ -55,9 +56,9 @@ class LRScheduler(object): Examples: Here is an example of a simple ``StepDecay`` implementation. - + .. code-block:: python - + import paddle from paddle.optimizer.lr import LRScheduler @@ -99,7 +100,7 @@ def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False): self.step() def __call__(self): - """ + """ Return lastest computed learning rate on current epoch. """ return self.last_lr @@ -107,7 +108,7 @@ def __call__(self): def step(self, epoch=None): """ - ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . + ``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` . The new learning rate will take effect on next ``optimizer.step`` . Args: @@ -191,7 +192,7 @@ def set_state_dict(self, state_dict): def get_lr(self): """ - + For those subclass who overload ``LRScheduler`` (Base Class), User should have a custom implementation of ``get_lr()`` . Otherwise, an ``NotImplementedError`` exception will be thrown. @@ -203,7 +204,7 @@ def get_lr(self): class NoamDecay(LRScheduler): r""" - Applies Noam Decay to the initial learning rate. + Applies Noam Decay to the initial learning rate. The algorithm can be described as following. @@ -211,7 +212,7 @@ class NoamDecay(LRScheduler): new\_learning\_rate = learning\_rate * d_{model}^{-0.5} * min(epoch^{-0.5}, epoch * warmup\_steps^{-1.5}) - Please reference `attention is all you need `_ + Please reference `attention is all you need `_ Args: @@ -312,8 +313,8 @@ class PiecewiseDecay(LRScheduler): learning_rate = 0.1 Args: - boundaries(list|tuple): A list/tuple of steps numbers. The type of element in the list is python int. - values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries. + boundaries(list|tuple): A list/tuple of steps numbers. The type of element in the list is python int. + values(list|tuple): A list/tuple of learning rate values that will be picked during different epoch boundaries. The type of element in the list is python float. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -322,7 +323,7 @@ class PiecewiseDecay(LRScheduler): ``PiecewiseDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -388,7 +389,7 @@ class NaturalExpDecay(LRScheduler): r""" Applies natural exponential decay to the initial learning rate. - + The algorithm can be described as following: .. 
math:: @@ -405,7 +406,7 @@ class NaturalExpDecay(LRScheduler): ``NaturalExpDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -476,7 +477,7 @@ class InverseTimeDecay(LRScheduler): Args: learning_rate (float): The initial learning rate. It is a python float number. - gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -485,7 +486,7 @@ class InverseTimeDecay(LRScheduler): ``InverseTimeDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -555,7 +556,7 @@ class PolynomialDecay(LRScheduler): .. math:: - decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps}) + decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps}) new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr @@ -563,7 +564,7 @@ class PolynomialDecay(LRScheduler): .. math:: - epoch & = min(epoch, decay\_steps) + epoch & = min(epoch, decay\_steps) new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr @@ -573,7 +574,7 @@ class PolynomialDecay(LRScheduler): decay_steps(int): The decay step size. It determines the decay cycle. It must be a positive integer. end_lr(float, optional): The minimum final learning rate. Default: 0.0001. power(float, optional): Power of polynomial. Default: 1.0. - cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease + cycle(bool, optional): Whether the learning rate rises again. If True, then the learning rate will rise when it decrease to ``end_lr`` . If False, the learning rate is monotone decreasing. Default: False. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -582,7 +583,7 @@ class PolynomialDecay(LRScheduler): ``PolynomialDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -671,21 +672,21 @@ class LinearWarmup(LRScheduler): Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler. For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ - + When epoch < warmup_steps, learning rate is updated as: - + .. math:: - + lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps} - + where start_lr is the initial learning rate, and end_lr is the final learning rate; - + When epoch >= warmup_steps, learning rate is updated as: - + .. math:: - + lr = learning_rate - + where ``learning_rate`` is float or any subclass of ``LRScheduler`` . Args: @@ -700,7 +701,7 @@ class LinearWarmup(LRScheduler): ``LinearWarmup`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -811,14 +812,14 @@ class ExponentialDecay(LRScheduler): Update learning rate by `gamma` each epoch. The algorithm can be described as following. - + .. 
math:: new\_learning\_rate = last\_learning\_rate * gamma Args: learning_rate (float): The initial learning rate. It is a python float number. - gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + gamma (float): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -827,7 +828,7 @@ class ExponentialDecay(LRScheduler): ``ExponentialDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -889,7 +890,7 @@ class MultiStepDecay(LRScheduler): """ Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones. - The algorithm can be described as the code below. + The algorithm can be described as the code below. .. code-block:: text @@ -906,17 +907,17 @@ class MultiStepDecay(LRScheduler): Args: learning_rate (float): The initial learning rate. It is a python float number. milestones (tuple|list): List or tuple of each boundaries. Must be increasing. - gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . - + Returns: ``MultiStepDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -999,7 +1000,7 @@ class StepDecay(LRScheduler): """ Update the learning rate of ``optimizer`` by ``gamma`` every ``step_size`` number of epoch. - The algorithm can be described as the code below. + The algorithm can be described as the code below. .. code-block:: text @@ -1015,7 +1016,7 @@ class StepDecay(LRScheduler): Args: learning_rate (float): The initial learning rate. It is a python float number. step_size (int): the interval to update. It must be a positive integer. - gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . It should be less than 1.0. Default: 0.1. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -1025,7 +1026,7 @@ class StepDecay(LRScheduler): Examples: - + .. code-block:: python import paddle @@ -1102,7 +1103,7 @@ class LambdaDecay(LRScheduler): """ Sets the learning rate of ``optimizer`` by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` . - The algorithm can be described as the code below. + The algorithm can be described as the code below. .. code-block:: text @@ -1118,12 +1119,12 @@ class LambdaDecay(LRScheduler): lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the initial learning rate by this factor. last_epoch (int, optional): The index of last epoch. Can be set to restart training. 
Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . - + Returns: ``LambdaDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -1188,37 +1189,37 @@ def get_lr(self): class ReduceOnPlateau(LRScheduler): """ - Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate + Reduce learning rate when ``metrics`` has stopped descending. Models often benefit from reducing the learning rate by 2 to 10 times once model performance has no longer improvement. - The ``metrics`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``metrics`` - stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` . - (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``metrics`` stop ascending for a ``patience`` + The ``metrics`` is the one which has been pass into ``step`` , it must be 1-D Tensor with shape [1]. When ``metrics`` + stop descending for a ``patience`` number of epochs, the learning rate will be reduced to ``learning_rate * factor`` . + (Specially, ``mode`` can also be set to ``'max`` , in this case, when ``metrics`` stop ascending for a ``patience`` number of epochs, the learning rate will be reduced.) In addition, After each reduction, it will wait a ``cooldown`` number of epochs before resuming above operation. Args: learning_rate (float): The initial learning rate. It is a python float number. - mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the - learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` , the learning + mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the + learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'`` , the learning rate will reduce when ``loss`` stops ascending. Default: ``'min'`` . - factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` . + factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` . It should be less than 1.0. Default: 0.1. - patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. + patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. Default: 10. - threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . + threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . This make tiny changes of ``loss`` will be ignored. Default: 1e-4. threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss`` - is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum + is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum change of ``loss`` is ``threshold`` . Default: ``'rel'`` . cooldown (int, optional): The number of epochs to wait before resuming normal operation. Default: 0. min_lr (float, optional): The lower bound of the learning rate after reduction. Default: 0. 
- epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, + epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, the update is ignored. Default: 1e-8. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False``. - + Returns: ``ReduceOnPlateau`` instance to schedule learning rate. @@ -1331,18 +1332,18 @@ def state_keys(self): def step(self, metrics, epoch=None): """ - step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` . + step should be called after `optimizer.step()` . It will update the learning rate in optimizer according to ``metrics`` . The new learning rate will take effect on next epoch. Args: - metrics (Tensor|numpy.ndarray|float): Which will be monitored to determine whether the learning rate will reduce. + metrics (Tensor|numpy.ndarray|float): Which will be monitored to determine whether the learning rate will reduce. If it stop descending for a ``patience`` number of epochs, the learning rate will reduce. If it's 'Tensor' or 'numpy.ndarray', its shape must be [1]. epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. Returns: None - + Examples: Please refer to the example of current LRScheduler. """ @@ -1354,8 +1355,9 @@ def step(self, metrics, epoch=None): # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] if isinstance(metrics, (Tensor, numpy.ndarray)): assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \ - "should be (1L,), but the current metrics.shape is {}. Maybe that " \ - "you should call paddle.mean to process it first.".format(metrics.shape) + "should be (1L,), but the current metrics.shape is {}. Maybe that " \ + "you should call paddle.mean to process it first.".format( + metrics.shape) elif not isinstance(metrics, (int, float, numpy.float32, numpy.float64)): raise TypeError( @@ -1399,8 +1401,8 @@ def _is_better(self, current, best): class CosineAnnealingDecay(LRScheduler): r""" - Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to - the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in + Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to + the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in SGDR. The algorithm can be described as following. @@ -1409,15 +1411,15 @@ class CosineAnnealingDecay(LRScheduler): \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), - & T_{cur} \neq (2k+1)T_{max}; + & T_{cur} \neq (2k+1)T_{max}; \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), & T_{cur} = (2k+1)T_{max}. - - It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts `_. + + It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts `_. Note that this only implements the cosine annealing part of SGDR, and not the restarts. - + Args: learning_rate (float): The initial learning rate, that is :math:`\eta_{max}` . It can be set to python float or int number. T_max (int): Maximum number of iterations. It is half of the decay cycle of learning rate. It must be a positive integer. 
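For a concrete sense of the cosine annealing schedule above, the closed-form learning rate at epoch T_cur can be reproduced with a few lines of plain Python (a sketch that mirrors the _get_closed_form_lr helper later in this file and assumes eta_min defaults to 0):

import math

def cosine_annealing_lr(base_lr, last_epoch, T_max, eta_min=0.0):
    # eta_t = eta_min + 0.5 * (eta_max - eta_min) * (1 + cos(pi * T_cur / T_max))
    return eta_min + (base_lr - eta_min) * (
        1 + math.cos(math.pi * last_epoch / T_max)) / 2

print(cosine_annealing_lr(0.5, 0, 10))    # 0.5    at the start of the cycle
print(cosine_annealing_lr(0.5, 5, 10))    # ~0.25  halfway through
print(cosine_annealing_lr(0.5, 10, 10))   # ~0.0   when T_cur == T_max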
@@ -1429,7 +1431,7 @@ class CosineAnnealingDecay(LRScheduler): ``CosineAnnealingDecay`` instance to schedule learning rate. Examples: - + .. code-block:: python import paddle @@ -1513,3 +1515,68 @@ def get_lr(self): def _get_closed_form_lr(self): return self.eta_min + (self.base_lr - self.eta_min) * (1 + math.cos( math.pi * self.last_epoch / self.T_max)) / 2 + + +class MultiplicativeDecay(LRScheduler): + """ + Multiply the learning rate of ``optimizer`` by the factor given in function ``lr_lambda`` . + + The algorithm can be described as the code below. + + .. code-block:: text + + learning_rate = 0.5 # init learning_rate + lr_lambda = lambda epoch: 0.95 + + learning_rate = 0.5 # epoch 0, + learning_rate = 0.475 # epoch 1, 0.5*0.95 + learning_rate = 0.45125 # epoch 2, 0.475*0.95 + + Args: + learning_rate (float): The initial learning rate. It is a python float number. + lr_lambda (function): A function which computes a factor by ``epoch`` , and then multiply the last learning rate by this factor. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``MultiplicativeDecay`` instance to schedule learning rate. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + # train on default dynamic graph mode + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr.MultiplicativeDecay(learning_rate=0.5, lr_lambda=lambda x:0.95, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(20): + for batch_id in range(5): + x = paddle.uniform([10, 10]) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_gradients() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch + + """ + + def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): + if not callable(lr_lambda): + raise TypeError( + "The type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s." + % type(lr_lambda)) + + self.lr_lambda = lr_lambda + super(MultiplicativeDecay, self).__init__(learning_rate, last_epoch, + verbose) + + def get_lr(self): + if self.last_epoch > 0: + return self.last_lr * self.lr_lambda(self.last_epoch) + else: + return self.base_lr diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 65425df72af9d..ada6b06eb6842 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -192,7 +192,7 @@ def __init__(self, def _update_regularization(self, weight_decay): reg_method = "" - reg_coeff = 0 + reg_coeff = 0.0 if (isinstance(weight_decay, L2DecayRegularizer)): reg_method = "l2_decay" @@ -306,7 +306,7 @@ def _append_optimize_op(self, block, param_and_grad): # the param's regularization has been done before, we avoid do l2decay in momentum. 
elif param.regularizer is not None: regularization_method = "" - regularization_coeff = 0 + regularization_coeff = 0.0 find_master = self._multi_precision and param_and_grad[ 0].dtype == core.VarDesc.VarType.FP16 @@ -380,7 +380,7 @@ def _multi_tensor_init(self, target_block, parameters): if isinstance(param.regularizer, L2DecayRegularizer): regularization_method = "l2_decay" regularization_coeff = param.regularizer._regularization_coeff - else: + elif param.regularizer is not None: regularization_method = "" regularization_coeff = 0.0 if param.dtype == paddle.float32: diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index a711d98df6fa1..3fc70449d15c9 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -218,7 +218,7 @@ def __init__(self, self._param_groups = self._parameter_list # NOTE: Multi Tensor: Pass in all parameters and gradients to the op kernel of the Optimizer at one time for updating for dygraph mode. - # Optimizer support list: [ paddle.optimizer.Momentum ]. + # Optimizer support list: [ paddle.optimizer.Momentum, paddle.optimizer.Adam]. self._use_multi_tensor = None self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} @@ -684,8 +684,10 @@ def _create_optimization_pass(self, parameters_and_grads): self._create_global_learning_rate() - # NOTE: Multi Tensor support [ Momentum ] for dygraph mode - if self._use_multi_tensor and self.__class__.__name__ in ['Momentum']: + # NOTE: Multi Tensor support [ Momentum, Adam ] for dygraph mode + if self._use_multi_tensor and self.__class__.__name__ in [ + 'Momentum', 'Adam' + ]: if len(self._param_dict['FP32_LODTensor']) == 0 and len( self._param_dict['FP16_LODTensor']) == 0: if isinstance(parameters_and_grads, list): diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index aab8f30881759..5167c18de179d 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -18,6 +18,10 @@ from ..fluid.framework import Variable, name_scope from ..fluid.dygraph import no_grad from paddle import _C_ops +import warnings +from ..fluid.layer_helper import LayerHelper +from ..fluid import unique_name +from ..fluid import layers __all__ = [] @@ -75,6 +79,7 @@ def __init__(self, parameters=None, weight_decay=None, grad_clip=None, + multi_precision=False, name=None): if learning_rate is None: raise ValueError("learning_rate is not set") @@ -85,27 +90,88 @@ def __init__(self, grad_clip=grad_clip, name=name) self.type = "sgd" + self._multi_precision = multi_precision + self._master_weights = {} + + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] + else: + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var + return var + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + if isinstance(parameters, dict): + parameters = self._update_param_group(parameters) + + # Create accumulator tensors for first and second moments + for p in parameters: + if 
self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + continue + if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Adam optimizer." + ) @no_grad def _append_optimize_op(self, block, param_and_grad): if isinstance(param_and_grad, dict): param_and_grad = self._update_param_group(param_and_grad) + + find_master = self._multi_precision and param_and_grad[ + 0].dtype == core.VarDesc.VarType.FP16 + master_weight = (self._master_weights[param_and_grad[0].name] + if find_master else None) + lr = self._create_param_lr(param_and_grad) if framework.in_dygraph_mode(): - _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], - param_and_grad[0]) + _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight, + param_and_grad[0], master_weight) return None assert isinstance(block, framework.Block) # create the optimize op + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": lr + } + + outputs = {"ParamOut": param_and_grad[0]} + + attrs = {"multi_precision": find_master} + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + sgd_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": lr - }, - outputs={"ParamOut": param_and_grad[0]}, + inputs=inputs, + outputs=outputs, + attrs=attrs, stop_gradient=True) return sgd_op diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 92aa5000dfa58..f18b77997a5e2 100755 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -45,6 +45,7 @@ from ..fluid.framework import cpu_places # noqa: F401 from ..fluid.framework import cuda_places # noqa: F401 from ..fluid.framework import xpu_places # noqa: F401 +from ..fluid.framework import mlu_places # noqa: F401 from ..fluid.framework import npu_places # noqa: F401 from ..fluid.framework import Variable # noqa: F401 from ..fluid.layers.control_flow import Print # noqa: F401 @@ -103,6 +104,7 @@ 'cuda_places', 'xpu_places', 'npu_places', + 'mlu_places', 'Variable', 'create_global_var', 'accuracy', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 314938ad7321f..32902029b8a47 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -18,6 +18,7 @@ from .attribute import shape # noqa: F401 from .attribute import real # noqa: F401 from .attribute import imag # noqa: F401 +from .attribute import is_floating_point # noqa: F401 from .creation import to_tensor # noqa: F401 from .creation import diag # noqa: F401 from .creation import diagflat # noqa: F401 @@ -38,9 +39,11 @@ from .creation import complex # noqa: F401 from .linalg import matmul # noqa: F401 from .linalg import dot # noqa: F401 +from .linalg import cov # noqa: F401 from .linalg import norm # noqa: F401 from .linalg import cond # noqa: F401 from .linalg import transpose # noqa: F401 +from .linalg import lstsq # noqa: F401 from .linalg import dist # noqa: F401 from .linalg import t # noqa: F401 from .linalg import cross # noqa: F401 @@ -59,6 +62,9 @@ from .linalg import eigh # noqa: F401 from .linalg import pinv # noqa: F401 from .linalg import solve # noqa: F401 +from .linalg import cholesky_solve # noqa: F401 +from .linalg import lu # noqa: F401 +from .linalg 
import lu_unpack # noqa: F401 from .logic import equal # noqa: F401 from .logic import greater_equal # noqa: F401 from .logic import greater_than # noqa: F401 @@ -115,6 +121,9 @@ from .manipulation import chunk # noqa: F401 from .manipulation import tensordot # noqa: F401 from .manipulation import as_complex # noqa: F401 +from .manipulation import take_along_axis # noqa: F401 +from .manipulation import put_along_axis # noqa: F401 +from .manipulation import put_along_axis_ # noqa: F401 from .manipulation import as_real # noqa: F401 from .manipulation import moveaxis # noqa: F401 from .manipulation import repeat_interleave # noqa: F401 @@ -155,12 +164,15 @@ from .math import square # noqa: F401 from .math import stanh # noqa: F401 from .math import sum # noqa: F401 +from .math import nansum # noqa: F401 from .math import tanh # noqa: F401 from .math import tanh_ # noqa: F401 from .math import add_n # noqa: F401 from .math import max # noqa: F401 +from .math import amax # noqa: F401 from .math import maximum # noqa: F401 from .math import min # noqa: F401 +from .math import amin # noqa: F401 from .math import minimum # noqa: F401 from .math import mm # noqa: F401 from .math import divide # noqa: F401 @@ -203,6 +215,8 @@ from .math import atanh # noqa: F401 from .math import lerp # noqa: F401 from .math import lerp_ # noqa: F401 +from .math import erfinv # noqa: F401 +from .math import erfinv_ # noqa: F401 from .math import rad2deg # noqa: F401 from .math import deg2rad # noqa: F401 from .math import gcd # noqa: F401 @@ -211,6 +225,8 @@ from .math import angle # noqa: F401 from .math import fmax # noqa: F401 from .math import fmin # noqa: F401 +from .math import inner # noqa: F401 +from .math import outer # noqa: F401 from .random import multinomial # noqa: F401 from .random import standard_normal # noqa: F401 @@ -222,6 +238,8 @@ from .random import randint # noqa: F401 from .random import randint_like # noqa: F401 from .random import randperm # noqa: F401 +from .random import poisson # noqa: F401 +from .random import exponential_ # noqa: F401 from .search import argmax # noqa: F401 from .search import argmin # noqa: F401 from .search import argsort # noqa: F401 @@ -233,11 +251,16 @@ from .search import sort # noqa: F401 from .search import index_sample # noqa: F401 from .search import masked_select # noqa: F401 +from .search import kthvalue # noqa: F401 +from .search import mode # noqa: F401 + from .stat import mean # noqa: F401 from .stat import std # noqa: F401 from .stat import var # noqa: F401 from .stat import numel # noqa: F401 from .stat import median # noqa: F401 +from .stat import quantile # noqa: F401 + from .to_string import set_printoptions # noqa: F401 from .array import array_length # noqa: F401 @@ -251,9 +274,11 @@ tensor_method_func = [ #noqa 'matmul', 'dot', + 'cov', 'norm', 'cond', 'transpose', + 'lstsq', 'dist', 't', 'cross', @@ -307,16 +332,21 @@ 'square', 'stanh', 'sum', + 'nansum', 'tanh', 'tanh_', 'add_n', 'max', + 'amax', 'maximum', 'min', + 'amin', 'minimum', 'fmax', 'fmin', 'mm', + 'inner', + 'outer', 'divide', 'floor_divide', 'remainder', @@ -337,6 +367,7 @@ 'clip_', 'trace', 'kron', + 'kthvalue', 'isfinite', 'isinf', 'isnan', @@ -410,12 +441,14 @@ 'var', 'numel', 'median', + 'quantile', 'is_complex', 'is_integer', 'rank', 'shape', 'real', 'imag', + 'is_floating_point', 'digamma', 'diagonal', 'trunc', @@ -428,10 +461,13 @@ 'uniform_', 'multi_dot', 'solve', + 'cholesky_solve', 'triangular_solve', 'asinh', 'atanh', 'acosh', + 'lu', + 'lu_unpack', 'as_complex', 
'as_real', 'rad2deg', @@ -439,11 +475,18 @@ 'gcd', 'lcm', 'diff', + "mode", 'lerp', 'lerp_', + 'erfinv', + 'erfinv_', 'angle', 'moveaxis', 'repeat_interleave', + 'take_along_axis', + 'put_along_axis', + 'put_along_axis_', + 'exponential_', ] #this list used in math_op_patch.py for magic_method bind diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index aefe2e4749a37..ee84b43e13fef 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -81,6 +81,30 @@ def is_complex(x): def is_floating_point(x): + """ + Returns whether the dtype of `x` is one of paddle.float64, paddle.float32, paddle.float16, and paddle.bfloat16. + + Args: + x (Tensor): The input tensor. + + Returns: + bool: True if the dtype of `x` is floating type, otherwise false. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.arange(1., 5., dtype='float32') + y = paddle.arange(1, 5, dtype='int32') + print(paddle.is_floating_point(x)) + # True + print(paddle.is_floating_point(y)) + # False + """ + if not isinstance(x, (paddle.Tensor, paddle.static.Variable)): + raise TypeError("Expected Tensor, but received type of x: {}".format( + type(x))) dtype = x.dtype is_fp_dtype = (dtype == core.VarDesc.VarType.FP32 or dtype == core.VarDesc.VarType.FP64 or diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 8a376884063f7..cd1faf64f3ea5 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -117,12 +117,6 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): ) != _current_expected_place()._get_device_id(): place = _current_expected_place() - if _in_eager_mode(): - if dtype is None: - dtype = paddle.get_default_dtype() - return core.eager.to_tensor(data, - convert_dtype(dtype), place, stop_gradient) - if not isinstance(data, np.ndarray): def _handle_dtype(data, dtype): @@ -172,12 +166,17 @@ def _handle_dtype(data, dtype): if dtype and convert_dtype(dtype) != data.dtype: data = data.astype(convert_dtype(dtype)) - return paddle.Tensor( - value=data, - place=place, - persistable=False, - zero_copy=False, - stop_gradient=stop_gradient) + # TOOD(jiabin): Support kwargs in eager tensor constructor + if _in_eager_mode() and isinstance(data, np.ndarray): + return core.eager.EagerTensor(data, place, False, False, None, + stop_gradient) + else: + return paddle.Tensor( + value=data, + place=place, + persistable=False, + zero_copy=False, + stop_gradient=stop_gradient) def full_like(x, fill_value, dtype=None, name=None): @@ -1158,8 +1157,7 @@ def empty_like(x, dtype=None, name=None): def assign(x, output=None): """ - - + The OP copies the :attr:`x` to the :attr:`output`. Parameters: @@ -1192,6 +1190,36 @@ def assign(x, output=None): return tensor.assign(x, output) +def clone(x, name=None): + """ + Returns a copy of input Tensor. It will always have a Tensor copy. + + In addition, This function is derivable, so gradients will flow back from the output to input. + + Parameters: + x (Tensor): The input Tensor. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: A Tensor copied from ``input`` . + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.ones([2]) + x.stop_gradient = False + clone_x = paddle.clone(x) + + y = clone_x**3 + y.backward() + print(clone_x.grad) # [3] + print(x.grad) # [3] + """ + return x.clone() + + #NOTE(zhiqiu): not public def _memcpy(input, place=None, output=None): """ diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index f333b527db38f..5f71606b7dc40 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -23,7 +23,6 @@ from paddle.common_ops_import import core from paddle.common_ops_import import VarDesc from paddle import _C_ops -import paddle __all__ = [] @@ -920,6 +919,119 @@ def dot(x, y, name=None): return out +def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None): + """ + Estimate the covariance matrix of the input variables, given data and weights. + + A covariance matrix is a square matrix, indicate the covariance of each pair variables in the input matrix. + For example, for an N-dimensional samples X=[x1,x2,…xN]T, then the covariance matrix + element Cij is the covariance of xi and xj. The element Cii is the variance of xi itself. + + Parameters: + x(Tensor): A N-D(N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below. + rowvar(Bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True + ddof(Bool, optional): If ddof=True will return the unbiased estimate, and ddof=False will return the simple average. Default: True + fweights(Tensor, optional): 1-D Tensor of integer frequency weights; The number of times each observation vector should be repeated. Default: None + aweights(Tensor, optional): 1-D Tensor of observation vector weights. How important of the observation vector, larger data means this element is more important. Default: None + name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name` + + Returns: + Tensor: The covariance matrix Tensor of the variables. + + Examples: + + .. code-block:: python + + import paddle + + xt = paddle.rand((3,4)) + paddle.linalg.cov(xt) + + ''' + Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + [[0.07918842, 0.06127326, 0.01493049], + [0.06127326, 0.06166256, 0.00302668], + [0.01493049, 0.00302668, 0.01632146]]) + ''' + """ + op_type = 'cov' + if len(x.shape) > 2 or len(x.shape) < 1: + raise ValueError( + "Input(x) only support N-D (1<=N<=2) tensor in cov, but received " + "length of Input(input) is %s." % len(x.shape)) + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'cov') + nx = x + if len(x.shape) == 1: + nx = x.reshape((1, -1)) + if not rowvar and nx.shape[0] != 1: + nx = nx.t() + w = None + observation_num = nx.shape[1] + if fweights is not None: + w = fweights.astype(nx.dtype) + if len(w.shape) > 1: + raise ValueError( + "Input(fweights) only support N-D (N<=1) tensor in cov, but received " + "shape of Input(input) is %s." 
% len(fweights.shape)) + if fweights.shape[0] != observation_num: + raise ValueError( + "The number of Input(fweights) should equal to x's dim[1]: {}, but received " + "size of Input(fweights) is {}.".format(observation_num, + fweights.shape[0])) + if fweights.min() < 0: + raise ValueError( + "The value of Input(fweights) cannot be negtive, but received " + "min of Input(fweights) is {}.".format(fweights.min())) + if not paddle.all(fweights == paddle.round(fweights.astype('float64'))): + raise ValueError("Input(fweights) must be integer ") + + if aweights is not None: + aw = aweights.astype(nx.dtype) + if len(aw.shape) > 1: + raise ValueError( + "Input(aweights) only support N-D (N<=1) tensor in cov, but received " + "length of Input(input) is %s." % len(aweights.shape)) + check_variable_and_dtype(aweights, 'dtype', ['float32', 'float64'], + 'cov') + if aweights.shape[0] != observation_num: + raise ValueError( + "The number of Input(aweights) should equal to x's dim[1]: {}, but received " + "size of Input(aweights) is {}.".format(observation_num, + aweights.shape[0])) + if aweights.min() < 0: + raise ValueError( + "The value of Input(aweights) cannot be negtive, but received " + "min of Input(aweights) is {}.".format(aweights.min())) + if w is not None: + w = w * aw + else: + w = aw + + w_sum = paddle.to_tensor(observation_num, dtype=nx.dtype) + if fweights is not None or aweights is not None: + w_sum = w.sum() + if w_sum.item() == 0: + raise ValueError("The sum of weights is zero, can't be normalized.") + + if w is not None: + nx_w = nx * w + avg = (nx_w).sum(axis=1) / w_sum + else: + avg = nx.sum(axis=1) / w_sum + nx_w = nx + + if w is not None and aweights is not None and ddof == True: + norm_factor = w_sum - (w * aweights).sum() / w_sum + else: + norm_factor = w_sum - ddof + if norm_factor <= 0: + norm_factor = paddle.to_tensor(0, dtype=nx.dtype) + nx = nx - avg.unsqueeze(1) + xxt = paddle.mm(nx, nx_w.t().conj()) + cov = paddle.divide(xxt, norm_factor).squeeze() + return cov + + def t(input, name=None): """ Transpose <=2-D tensor. @@ -1430,7 +1542,7 @@ def det(x, name=None): """ if in_dygraph_mode(): - return core.ops.determinant(x) + return _C_ops.determinant(x) check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'det') @@ -1485,7 +1597,7 @@ def slogdet(x, name=None): """ if in_dygraph_mode(): - return core.ops.slogdeterminant(x) + return _C_ops.slogdeterminant(x) check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'slogdet') @@ -1633,7 +1745,7 @@ def matrix_power(x, n, name=None): # [ 1.80555556 , -1.91666667 , 0.44444444 ]] """ if in_dygraph_mode(): - return core.ops.matrix_power(x, "n", n) + return _C_ops.matrix_power(x, "n", n) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'matrix_power') check_type(n, 'n', int, 'matrix_power') @@ -1711,6 +1823,205 @@ def qr(x, mode="reduced", name=None): return q, r +def lu(x, pivot=True, get_infos=False, name=None): + r""" + Computes the LU factorization of an N-D(N>=2) matrix x. + + Returns the LU factorization(inplace x) and Pivots. low triangular matrix L and + upper triangular matrix U are combined to a single LU matrix. + + Pivoting is done if pivot is set to True. + P mat can be get by pivots: + # ones = eye(rows) #eye matrix of rank rows + # for i in range(cols): + # swap(ones[i], ones[pivots[i]]) + # return ones + + Args: + + X (Tensor): the tensor to factor of N-dimensions(N>=2). + + pivot (bool, optional): controls whether pivoting is done. Default: True. 
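# [Editor's note] The "P mat can be get by pivots" pseudocode in this docstring can be made
# concrete with a small NumPy sketch. Illustrative only, not part of this diff; it assumes the
# pivots are 1-based, LAPACK-style row interchanges, which matches the example further down
# (pivots [3, 3] for a 3x2 input).
import numpy as np

def pivots_to_permutation(pivots, rows):
    # Apply the recorded row interchanges to the identity; the transpose of the
    # accumulated swaps is the permutation P that satisfies X = P @ L @ U.
    perm = np.eye(rows)
    for i, p in enumerate(pivots):
        j = int(p) - 1                      # convert 1-based pivot index to a 0-based row
        perm[[i, j], :] = perm[[j, i], :]   # swap row i with the pivot row
    return perm.T

# pivots_to_permutation([3, 3], 3) gives [[0,1,0],[0,0,1],[1,0,0]], matching the P returned
# by lu_unpack in the example below.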
+ + get_infos (bool, optional): if set to True, returns an info IntTensor. Default: False. + + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + factorization (Tensor): LU matrix, the factorization of input X. + + pivots (IntTensor): the pivots of size(∗(N-2), min(m,n)). `pivots` stores all the + intermediate transpositions of rows. The final permutation `perm` could be + reconstructed by this, details refer to upper example. + + infos (IntTensor, optional): if `get_infos` is `True`, this is a tensor of size (∗(N-2)) + where non-zero values indicate whether factorization for the matrix or each minibatch + has succeeded or failed. + + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') + lu,p,info = paddle.linalg.lu(x, get_infos=True) + + # >>> lu: + # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[5. , 6. ], + # [0.20000000, 0.80000000], + # [0.60000000, 0.50000000]]) + # >>> p + # Tensor(shape=[2], dtype=int32, place=CUDAPlace(0), stop_gradient=True, + # [3, 3]) + # >>> info + # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True, + # 0) + + P,L,U = paddle.linalg.lu_unpack(lu,p) + + # >>> P + # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[0., 1., 0.], + # [0., 0., 1.], + # [1., 0., 0.]]), + # >>> L + # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[1. , 0. ], + # [0.20000000, 1. ], + # [0.60000000, 0.50000000]]), + # >>> U + # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[5. , 6. ], + # [0. , 0.80000000]])) + + + # one can verify : X = P @ L @ U ; + """ + if in_dygraph_mode(): + LU, Piv, Info = _C_ops.lu(x, 'pivots', pivot) + if get_infos: + return LU, Piv, Info + else: + return LU, Piv + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'lu') + helper = LayerHelper('lu', **locals()) + lu = helper.create_variable_for_type_inference(dtype=x.dtype) + p = helper.create_variable_for_type_inference(dtype='int') + info = helper.create_variable_for_type_inference(dtype='int') + attrs = dict() + attrs['pivots'] = pivot + helper.append_op( + type='lu', + inputs={'X': x}, + outputs={'Out': lu, + 'Pivots': p, + 'Infos': info}, + attrs=attrs) + if get_infos: + return lu, p, info + else: + return lu, p + + +def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): + r""" + Unpack L U and P to single matrix tensor . + unpack L and U matrix from LU, unpack permutation matrix P from Pivtos . + + P mat can be get by pivots: + # ones = eye(rows) #eye matrix of rank rows + # for i in range(cols): + # swap(ones[i], ones[pivots[i]]) + + + Args: + x (Tensor): The LU tensor get from paddle.linalg.lu, which is combined by L and U. + + y (Tensor): Pivots get from paddle.linalg.lu. + + unpack_ludata (bool,optional): whether to unpack L and U from x. Default: True. + + unpack_pivots (bool, optional): whether to unpack permutation matrix P from Pivtos. Default: True. + + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + P (Tensor): Permutation matrix P of lu factorization. + + L (Tensor): The lower triangular matrix tensor of lu factorization. + + U (Tensor): The upper triangular matrix tensor of lu factorization. + + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') + lu,p,info = paddle.linalg.lu(x, get_infos=True) + + # >>> lu: + # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[5. , 6. ], + # [0.20000000, 0.80000000], + # [0.60000000, 0.50000000]]) + # >>> p + # Tensor(shape=[2], dtype=int32, place=CUDAPlace(0), stop_gradient=True, + # [3, 3]) + # >>> info + # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True, + # 0) + + P,L,U = paddle.linalg.lu_unpack(lu,p) + + # >>> P + # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[0., 1., 0.], + # [0., 0., 1.], + # [1., 0., 0.]]), + # >>> L + # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[1. , 0. ], + # [0.20000000, 1. ], + # [0.60000000, 0.50000000]]), + # >>> U + # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[5. , 6. ], + # [0. , 0.80000000]])) + + # one can verify : X = P @ L @ U ; + """ + + if in_dygraph_mode(): + P, L, U = _C_ops.lu_unpack(x, y, 'unpack_ludata', unpack_ludata, + 'unpack_pivots', unpack_pivots) + return P, L, U + + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'lu_unpack') + helper = LayerHelper('lu_unpack', **locals()) + p = helper.create_variable_for_type_inference(dtype=x.dtype) + l = helper.create_variable_for_type_inference(dtype=x.dtype) + u = helper.create_variable_for_type_inference(dtype=x.dtype) + + attrs = dict() + attrs['unpack_ludata'] = unpack_ludata + attrs['unpack_pivots'] = unpack_pivots + helper.append_op( + type='lu_unpack', + inputs={'X': x, + 'Pivots': y}, + outputs={'Pmat': p, + 'L': l, + 'U': u}, + attrs=attrs) + return p, l, u + + def eig(x, name=None): """ This API performs the eigenvalue decomposition of a square matrix or a batch of square matrices. @@ -2388,6 +2699,56 @@ def triangular_solve(x, return out +def cholesky_solve(x, y, upper=False, name=None): + r""" + Solves a linear system of equations A @ X = B, given A's Cholesky factor matrix u and matrix B. + + Input `x` and `y` is 2D matrices or batches of 2D matrices. If the inputs are batches, the outputs + is also batches. + + Args: + x (Tensor): The input matrix which is upper or lower triangular Cholesky factor of square matrix A. Its shape should be `[*, M, M]`, where `*` is zero or + more batch dimensions. Its data type should be float32 or float64. + y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is + zero or more batch dimensions. Its data type should be float32 or float64. + upper (bool, optional): whether to consider the Cholesky factor as a lower or upper triangular matrix. Default: False. + name(str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The solution of the system of equations. Its data type is the same as that of `x`. + + Examples: + .. 
code-block:: python + + import paddle + + u = paddle.to_tensor([[1, 1, 1], + [0, 2, 1], + [0, 0,-1]], dtype="float64") + b = paddle.to_tensor([[0], [-9], [5]], dtype="float64") + out = paddle.linalg.cholesky_solve(b, u, upper=True) + + print(out) + # [-2.5, -7, 9.5] + """ + if in_dygraph_mode(): + return _C_ops.cholesky_solve(x, y, 'upper', upper) + + helper = LayerHelper("cholesky_solve", **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'cholesky_solve') + check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'cholesky_solve') + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type='cholesky_solve', + inputs={'X': x, + 'Y': y}, + outputs={'Out': out}, + attrs={'upper': upper}) + return out + + def eigvalsh(x, UPLO='L', name=None): """ Computes the eigenvalues of a @@ -2453,3 +2814,107 @@ def __check_input(x, UPLO): attrs={'UPLO': UPLO, 'is_test': is_test}) return out_value + + +def lstsq(x, y, rcond=1e-15, driver=None, name=None): + device = paddle.device.get_device() + if device == "cpu": + if driver not in (None, "gels", "gelss", "gelsd", "gelsy"): + raise ValueError( + "Only support valid driver is 'gels', 'gelss', 'gelsd', 'gelsy' or None for CPU inputs. But got {}". + format(driver)) + driver = "gelsy" if driver is None else driver + elif "gpu" in device: + if driver not in (None, "gels"): + raise ValueError( + "Only support valid driver is 'gels' or None for CUDA inputs. But got {}". + format(driver)) + driver = "gels" if driver is None else driver + else: + raise RuntimeError("Only support lstsq api for CPU or CUDA device.") + + if in_dygraph_mode(): + solution, rank, singular_values = _C_ops.lstsq(x, y, "rcond", rcond, + "driver", driver) + if x.shape[-2] > x.shape[-1]: + matmul_out = _varbase_creator(dtype=x.dtype) + _C_ops.matmul(x, solution, matmul_out, 'trans_x', False, 'trans_y', + False) + minus_out = _C_ops.elementwise_sub(matmul_out, y) + pow_out = _C_ops.pow(minus_out, 'factor', 2) + residuals = _C_ops.reduce_sum(pow_out, 'dim', [-2], 'keepdim', + False, 'reduce_all', False) + else: + residuals = paddle.empty(shape=[0], dtype=x.dtype) + + if driver == "gels": + rank = paddle.empty(shape=[0], dtype=paddle.int32) + singular_values = paddle.empty(shape=[0], dtype=x.dtype) + elif driver == "gelsy": + singular_values = paddle.empty(shape=[0], dtype=x.dtype) + + return solution, residuals, rank, singular_values + + helper = LayerHelper('lstsq', **locals()) + check_variable_and_dtype( + x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'lstsq') + check_variable_and_dtype( + y, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'lstsq') + + solution = helper.create_variable_for_type_inference(dtype=x.dtype) + residuals = helper.create_variable_for_type_inference(dtype=x.dtype) + rank = helper.create_variable_for_type_inference(dtype=paddle.int32) + singular_values = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type='lstsq', + inputs={'X': x, + 'Y': y}, + outputs={ + 'Solution': solution, + 'Rank': rank, + 'SingularValues': singular_values + }, + attrs={'rcond': rcond, + 'driver': driver}) + + matmul_out = helper.create_variable_for_type_inference(dtype=x.dtype) + minus_out = helper.create_variable_for_type_inference(dtype=x.dtype) + pow_out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='matmul_v2', + inputs={'X': x, + 'Y': solution}, + outputs={'Out': matmul_out}, + attrs={ + 'trans_x': False, + 'trans_y': False, + }) + + 
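# [Editor's note] The matmul_v2 / elementwise_sub / pow / reduce_sum ops appended in this
# static-graph branch build the same residual term as the dygraph branch above, i.e. roughly
# residuals = ((x @ solution - y) ** 2).sum(axis=-2) for the overdetermined case. A minimal
# dygraph-style sketch (illustrative only; it assumes the function is exported as
# paddle.linalg.lstsq like the other routines in this file):
import paddle

A = paddle.to_tensor([[1., 1.], [1., 2.], [1., 3.]])      # 3 equations, 2 unknowns
b = paddle.to_tensor([[6.], [9.], [13.]])
solution, residuals, rank, singular_values = paddle.linalg.lstsq(A, b)
# residuals should agree with an explicit sum of squared errors along the row dimension:
check = ((paddle.matmul(A, solution) - b) ** 2).sum(axis=-2)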
helper.append_op( + type='elementwise_sub', + inputs={'X': matmul_out, + 'Y': y}, + outputs={'Out': minus_out}) + + helper.append_op( + type='pow', + inputs={'X': minus_out}, + outputs={'Out': pow_out}, + attrs={'factor': 2}) + + helper.append_op( + type='reduce_sum', + inputs={'X': pow_out}, + outputs={'Out': residuals}, + attrs={'dim': [-2], + 'keep_dim': False, + 'reduce_all': False}) + + if driver == "gels": + rank = paddle.static.data(name='rank', shape=[0]) + singular_values = paddle.static.data(name='singular_values', shape=[0]) + elif driver == "gelsy": + singular_values = paddle.static.data(name='singular_values', shape=[0]) + + return solution, residuals, rank, singular_values diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py old mode 100644 new mode 100755 index 66a990f6dc7bf..a15c1af391f9f --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -70,8 +70,8 @@ def fill_(x, value): raise TypeError( "The type of 'value' must be int or float, but received %s." % (type(value))) - return core.ops.fill_any_(x, "value_float", - float(value), "value_int", int(value)) + return _C_ops.fill_any_(x, "value_float", + float(value), "value_int", int(value)) setattr(core.VarBase, 'fill_', fill_) @@ -102,7 +102,7 @@ def zero_(x): print(tensor.tolist()) #[0, 0, 0, 0, 0] """ - return core.ops.fill_any_(x, "value_float", 0., "value_int", int(0)) + return _C_ops.fill_any_(x, "value_float", 0., "value_int", int(0)) setattr(core.VarBase, 'zero_', zero_) @@ -148,10 +148,10 @@ def fill_diagonal_(x, value, offset=0, wrap=False, name=None): 'Tensor dims should be equal while input dims > 2 in fill_diagonal_ API' ) if len(inshape) == 2: - return core.ops.fill_diagonal_(x, 'value', value, 'offset', offset, - 'wrap', wrap) - return core.ops.fill_diagonal_(x, 'value', value, 'offset', offset, 'wrap', - True) + return _C_ops.fill_diagonal_(x, 'value', value, 'offset', offset, + 'wrap', wrap) + return _C_ops.fill_diagonal_(x, 'value', value, 'offset', offset, 'wrap', + True) setattr(core.VarBase, 'fill_diagonal_', fill_diagonal_) @@ -182,10 +182,10 @@ def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False): y = y.reshape([1, -1]) if inplace: - return core.ops.fill_diagonal_tensor_(x, y, 'dim1', dim1, 'dim2', dim2, - 'offset', offset) - return core.ops.fill_diagonal_tensor(x, y, 'dim1', dim1, 'dim2', dim2, - 'offset', offset) + return _C_ops.fill_diagonal_tensor_(x, y, 'dim1', dim1, 'dim2', dim2, + 'offset', offset) + return _C_ops.fill_diagonal_tensor(x, y, 'dim1', dim1, 'dim2', dim2, + 'offset', offset) def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None): @@ -475,7 +475,7 @@ def flip(x, axis, name=None): if isinstance(axis, int): axis = [axis] if in_dygraph_mode(): - return core.ops.flip(x, "axis", axis) + return _C_ops.flip(x, "axis", axis) helper = LayerHelper("flip", **locals()) check_type(x, 'X', (Variable), 'flip') @@ -1107,7 +1107,7 @@ def unique_consecutive(x, axis = [axis] attr_dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - out, inverse, counts = core.ops.unique_consecutive( + out, inverse, counts = _C_ops.unique_consecutive( x, 'dtype', attr_dtype, 'return_inverse', return_inverse, 'return_counts', return_counts, 'axis', axis) outs = [out] @@ -1838,7 +1838,7 @@ def expand_as(x, y, name=None): "you must set its stop_gradient to be False by " "some_var.stop_gradient = True, supporting " "some_var as the input 'x'.") - inputs = {"X": [x]} + inputs = {"X": [x], "Y": [y]} helper 
= LayerHelper('expand_as', **locals()) dtype = helper.input_dtype(input_param_name='x') @@ -2749,3 +2749,145 @@ def moveaxis(x, source, destination, name=None): 'XShape': [x_shape]}, attrs={'axis': perm}) return out + + +def take_along_axis(arr, indices, axis): + """ + Take values from the input array by given indices matrix along the designated axis. + + Args: + arr (Tensor) : The input Tensor. Supported data types are float32 and float64. + indices (Tensor) : Indices to take along each 1d slice of arr. This must match the dimension of arr, + and need to broadcast against arr. Supported data type are int and int64. + axis (int) : The axis to take 1d slices along. + + Returns: + Tensor: The indexed element, same dtype with arr + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x_np = np.array([[1, 2, 3], [4, 5, 6], [7,8,9]]) + index_np = np.array([[0]]) + x = paddle.to_tensor(x_np) + index = paddle.to_tensor(index_np) + axis = 0 + result = paddle.take_along_axis(x, index, axis) + print(result) + # [[1, 2, 3]] + """ + if (arr.shape == indices.shape): + broadcast_shape = arr.shape + else: + broadcast_shape_list = list(arr.shape) + broadcast_shape_list[axis] = 1 + broadcast_shape = tuple(broadcast_shape_list) + if in_dygraph_mode(): + indices = paddle.broadcast_to(indices, broadcast_shape) + return _C_ops.take_along_axis(arr, indices, 'Axis', axis) + check_variable_and_dtype( + arr, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], + 'take_along_axis') + check_variable_and_dtype(indices, 'index', ['int32', 'int64'], + 'take_along_axis') + indices = paddle.broadcast_to(indices, broadcast_shape) + helper = LayerHelper('take_along_axis', **locals()) + dtype = helper.input_dtype() + result = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="take_along_axis", + inputs={"Input": arr, + "Index": indices}, + attrs={"Axis": axis}, + outputs={"Result": result}) + return result + + +def put_along_axis(arr, indices, values, axis, reduce='assign'): + """ + Put values into the destination array by given indices matrix along the designated axis. + + Args: + arr (Tensor) : The Destination Tensor. Supported data types are float32 and float64. + indices (Tensor) : Indices to put along each 1d slice of arr. This must match the dimension of arr, + and need to broadcast against arr. Supported data type are int and int64. + axis (int) : The axis to put 1d slices along. + reduce (string | optinal) : The reduce operation, default is 'assign', support 'add', 'assign', 'mul' and 'multiply'. + Returns : + Tensor: The indexed element, same dtype with arr + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + x_np = np.array([[10, 30, 20], [60, 40, 50]]) + index_np = np.array([[0]]) + x = paddle.to_tensor(x_np) + index = paddle.to_tensor(index_np) + value = 99 + axis = 0 + result = paddle.put_along_axis(x, index, value, axis) + print(result) + # [[99, 99, 99], + # [60, 40, 50]] + + """ + if (arr.shape == indices.shape): + broadcast_shape = arr.shape + else: + broadcast_shape_list = list(arr.shape) + broadcast_shape_list[axis] = 1 + broadcast_shape = tuple(broadcast_shape_list) + if in_dygraph_mode(): + indices = paddle.broadcast_to(indices, broadcast_shape) + values = paddle.to_tensor(values) if not isinstance( + values, paddle.Tensor) else values + values = paddle.broadcast_to(values, broadcast_shape) + return _C_ops.put_along_axis(arr, indices, values, "Axis", axis, + "Reduce", reduce) + + check_variable_and_dtype( + arr, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], + 'put_along_axis') + check_variable_and_dtype(indices, 'index', ['int32', 'int64'], + 'put_along_axis') + indices = paddle.broadcast_to(indices, broadcast_shape) + values = paddle.broadcast_to(values, broadcast_shape) + helper = LayerHelper('put_along_axis', **locals()) + dtype = helper.input_dtype() + result = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="put_along_axis", + inputs={"Input": arr, + "Index": indices, + "Value": values}, + attrs={"Axis": axis, + "Reduce": reduce}, + outputs={"Result": result}) + return result + + +@inplace_apis_in_dygraph_only +def put_along_axis_(arr, indices, values, axis, reduce='assign'): + r""" + Inplace version of ``put_along_axis`` API, the output Tensor will be inplaced with input ``x``. + Please refer to :ref:`api_tensor_put_along_axis`. + """ + if (arr.shape == indices.shape): + broadcast_shape = arr.shape + else: + broadcast_shape_list = list(arr.shape) + broadcast_shape_list[axis] = 1 + broadcast_shape = tuple(broadcast_shape_list) + + indices = paddle.broadcast_to(indices, broadcast_shape) + values = paddle.to_tensor(values) if not isinstance( + values, paddle.Tensor) else values + values = paddle.broadcast_to(values, broadcast_shape) + return _C_ops.put_along_axis_(arr, indices, values, "Axis", axis, "Reduce", + reduce) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 9e59fbc56ad4c..c4a92b1486d58 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -905,6 +905,66 @@ def get_dtype(x, dtype): return out +def nansum(x, axis=None, dtype=None, keepdim=False, name=None): + """ + Computes the sum of tensor elements over the given axis, treating Not a Numbers (NaNs) as zero. + + Args: + x (Tensor): An N-D Tensor, the data type is float32, float64, int32 or int64. + axis (int|list|tuple, optional): The dimensions along which the nansum is performed. If + :attr:`None`, nansum all elements of :attr:`x` and return a + Tensor with a single element, otherwise must be in the + range :math:`[-rank(x), rank(x))`. If :math:`axis[i] < 0`, + the dimension to reduce is :math:`rank + axis[i]`. + dtype (str, optional): The dtype of output Tensor. The default value is None, the dtype + of output is the same as input Tensor `x`. + keepdim (bool, optional): Whether to reserve the reduced dimension in the + output Tensor. The result Tensor will have one fewer dimension + than the :attr:`x` unless :attr:`keepdim` is true, default + value is False. + name (str, optional): The default value is None. 
Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor: Results of summation operation on the specified axis of input Tensor `x`, + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + # x is a Tensor with following elements: + # [[nan, 0.3, 0.5, 0.9] + # [0.1, 0.2, -nan, 0.7]] + # Each example is followed by the corresponding output tensor. + x = np.array([[float('nan'), 0.3, 0.5, 0.9], + [0.1, 0.2, float('-nan'), 0.7]]).astype(np.float32) + x = paddle.to_tensor(x) + out1 = paddle.nansum(x) # [2.7] + out2 = paddle.nansum(x, axis=0) # [0.1, 0.5, 0.5, 1.6] + out3 = paddle.nansum(x, axis=-1) # [1.7, 1.0] + out4 = paddle.nansum(x, axis=1, keepdim=True) # [[1.7], [1.0]] + + # y is a Tensor with shape [2, 2, 2] and elements as below: + # [[[1, nan], [3, 4]], + # [[5, 6], [-nan, 8]]] + # Each example is followed by the corresponding output tensor. + y = np.array([[[1, float('nan')], [3, 4]], + [[5, 6], [float('-nan'), 8]]]) + y = paddle.to_tensor(y) + out5 = paddle.nansum(y, axis=[1, 2]) # [8, 19] + out6 = paddle.nansum(y, axis=[0, 1]) # [9, 18] + """ + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'nansum') + check_type(axis, 'axis', (int, list, tuple, type(None)), 'nansum') + + zero_tensor = paddle.zeros_like(x) + tmp_tensor = paddle.where(isnan(x), zero_tensor, x) + return sum(tmp_tensor, axis, dtype, keepdim, name) + + @templatedoc(op_type="sum") def add_n(inputs, name=None): """ @@ -1058,6 +1118,38 @@ def mm(input, mat2, name=None): Returns: Tensor: The product Tensor. + :: + + * example 1: + + input: [B, ..., M, K], mat2: [B, ..., K, N] + out: [B, ..., M, N] + + * example 2: + + input: [B, M, K], mat2: [B, K, N] + out: [B, M, N] + + * example 3: + + input: [B, M, K], mat2: [K, N] + out: [B, M, N] + + * example 4: + + input: [M, K], mat2: [K, N] + out: [M, N] + + * example 5: + + input: [B, M, K], mat2: [K] + out: [B, M] + + * example 6: + + input: [K], mat2: [K] + out: [1] + Examples: .. code-block:: python @@ -1194,6 +1286,185 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): type="addmm", inputs=inputs, attrs=attrs, outputs={"Out": out}) return out +def renorm(x, p, axis, max_norm): + """ + **renorm** + + This operator is used to calculate the p-norm along the axis, + suppose the input-shape on axis dimension has the value of T, then + the tensor is split into T parts, the p-norm should be calculated for each + part, if the p-norm for part i is larger than max-norm, then each element + in part i should be re-normalized at the same scale so that part-i' p-norm equals + max-norm exactly, otherwise part-i stays unchanged. + + Args: + x (Tensor): The input Tensor + p (float): The power of the norm operation. + axis (int): the dimension to slice the tensor. + max-norm (float): the maximal norm limit. + + Returns: + Tensor: the renorm Tensor. + + Examples: + .. 
code-block:: python + + import paddle + input = [[[2.0,2,-2],[3,0.3,3]],[[2,-8,2],[3.1,3.7,3]]] + x = paddle.to_tensor(input,dtype='float32') + y = paddle.renorm(x, 1.0, 2, 2.05) + print(y) + # [[[ 0.40594056, 0.29285714, -0.41000000], + # [ 0.60891086, 0.04392857, 0.61500001]], + # [[ 0.40594056, -1.17142856, 0.41000000], + # [ 0.62920785, 0.54178572, 0.61500001]]]) + + """ + input_shape = x.shape + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'renorm') + if not axis < len(input_shape): + raise ValueError("the axis:{} should be less then the shape's size {}:{}".format(axis,len(input_shape),input_shape)) + if not axis >=0: + if not axis >= -1 * len(input_shape): + raise ValueError("the axis:{} should not be less than -1 * length of input_shape:{}".format(axis,-1 * len(input_shape))) + axis = axis + len(input_shape) + if in_dygraph_mode(): + out = core.ops.renorm(x, 'p',p, 'axis',axis, 'max_norm', max_norm) + return out + + inputs = {'X': x} + attrs = {'p': p, 'axis': axis, 'max_norm':max_norm} + + helper = LayerHelper("renorm", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="renorm", inputs=inputs, attrs=attrs, outputs={"Out": out}) + return out + + + +def inner(x, y, name=None): + """ + + Inner product of two input Tensor. + + Ordinary inner product for 1-D Tensors, in higher dimensions a sum product over the last axes. + + Args: + x (Tensor): An N-D Tensor or a Scalar Tensor. If its not a scalar Tensor, its last dimensions must match y's. + y (Tensor): An N-D Tensor or a Scalar Tensor. If its not a scalar Tensor, its last dimensions must match x's. + name(str, optional): The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor: The inner-product Tensor, the output shape is x.shape[:-1] + y.shape[:-1]. + + Examples: + .. code-block:: python + + import paddle + x = paddle.arange(1, 7).reshape((2, 3)).astype('float32') + y = paddle.arange(1, 10).reshape((3, 3)).astype('float32') + out = paddle.inner(x, y) + print(out) + # ([[14, 32, 50], + # [32, 77, 122]]) + + + """ + if x.size == 1 or y.size == 1: + return multiply(x, y) + else: + xshape = x.shape + yshape = y.shape + dstshape = list(xshape[:-1])+list(yshape[:-1]) + if len(dstshape)==0: + dstshape = [1] + nx = x.reshape((-1, xshape[-1])) + ny = y.reshape((-1, yshape[-1])) + + if in_dygraph_mode(): + return _C_ops.matmul_v2(nx, ny.T).reshape(dstshape) + + def __check_input(x, y): + var_names = {'x': x, 'y': y} + for name, val in var_names.items(): + check_variable_and_dtype(val, name, + ['float16', 'float32', 'float64'], 'inner') + x_shape = list(xshape) + y_shape = list(yshape) + + # check the inner 2 dimensions + if x_shape[-1] != y_shape[-1]: + if not ((x_shape[-1] == -1) or (y_shape[-1] == -1)): + raise ValueError( + "After performing an optional transpose, Input X's last dim should be " + "equal to Y's last dim for multiplication " + "prerequisites. But received X's shape: %s, Y's shape: %s\n" + % (x_shape, y_shape)) + + __check_input(nx, ny) + + helper = LayerHelper('inner', **locals()) + out = helper.create_variable_for_type_inference(dtype=nx.dtype) + helper.append_op( + type='matmul_v2', inputs={'X': nx, + 'Y': ny.T}, outputs={'Out': out}) + return out.reshape(dstshape) + + +def outer(x, y, name=None): + """ + + Outer product of two Tensors. + + Input is flattened if not already 1-dimensional. + + Args: + x (Tensor): An N-D Tensor or a Scalar Tensor. 
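# [Editor's note] A rough cross-check of the inner() routine defined above: it reduces over
# the last axis of both inputs, so for 2-D tensors it is equivalent to x @ y.T and the result
# has shape x.shape[:-1] + y.shape[:-1]. Illustrative NumPy sketch only, not part of the diff:
import numpy as np

x = np.arange(1., 7.).reshape(2, 3)
y = np.arange(1., 10.).reshape(3, 3)
assert np.allclose(np.inner(x, y), x @ y.T)   # both give the (2, 3) result shown above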
+ y (Tensor): An N-D Tensor or a Scalar Tensor. + name(str, optional): The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor: The outer-product Tensor. + + Examples: + .. code-block:: python + + import paddle + x = paddle.arange(1, 4).astype('float32') + y = paddle.arange(1, 6).astype('float32') + out = paddle.outer(x, y) + print(out) + # ([[1, 2, 3, 4, 5], + # [2, 4, 6, 8, 10], + # [3, 6, 9, 12, 15]]) + + + """ + nx = x.reshape((-1, 1)) + ny = y.reshape((1, -1)) + + if in_dygraph_mode(): + return _C_ops.matmul_v2(nx, ny) + + def __check_input(x, y): + var_names = {'x': x, 'y': y} + for name, val in var_names.items(): + check_variable_and_dtype(val, name, + ['float16', 'float32', 'float64'], 'inner') + + __check_input(nx, ny) + + helper = LayerHelper('outer', **locals()) + out = helper.create_variable_for_type_inference(dtype=nx.dtype) + helper.append_op( + type='matmul_v2', inputs={'X': nx, + 'Y': ny}, outputs={'Out': out}) + return out + def logsumexp(x, axis=None, keepdim=False, name=None): r""" @@ -1307,15 +1578,37 @@ def _check_input(x): type='inverse', inputs={'Input': [x] }, outputs={'Output': [out]}) return out +def _get_reduce_all_value(axis): + """ + Internal function for max, min, amax and amin. + It computes the attribute reduce_all value based on axis. + """ + if axis is not None and not isinstance(axis, list): + if isinstance(axis, tuple): + axis = list(axis) + elif isinstance(axis, int): + axis= [axis] + else: + raise TypeError( + "The type of axis must be int, list or tuple, but received {}".format(type(axis))) + + reduce_all = True if axis == None or axis == [] else False + axis = axis if axis != None and axis != [] else [0] + return reduce_all, axis def max(x, axis=None, keepdim=False, name=None): """ Computes the maximum of tensor elements over the given axis. + Note: + The difference between max and amax is: If there are multiple maximum elements, + amax evenly distributes gradient between these equal values, + while max propagates gradient to all of them. + + Args: - x(Tensor): A tensor, the data type is float32, - float64, int32, int64. + x(Tensor): A tensor, the data type is float32, float64, int32, int64. axis(int|list|tuple, optional): The axis along which the maximum is computed. 
If :attr:`None`, compute the maximum over all elements of `x` and return a Tensor with a single element, @@ -1339,47 +1632,50 @@ def max(x, axis=None, keepdim=False, name=None): # data_x is a Tensor with shape [2, 4] # the axis is a int element - x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]]) + [0.1, 0.2, 0.6, 0.7]], + dtype='float64', stop_gradient=False) result1 = paddle.max(x) - print(result1) - #[0.9] + result1.backward() + print(result1, x.grad) + #[0.9], [[0., 0., 0., 1.], [0., 0., 0., 0.]] + + x.clear_grad() result2 = paddle.max(x, axis=0) - print(result2) - #[0.2 0.3 0.6 0.9] + result2.backward() + print(result2, x.grad) + #[0.2, 0.3, 0.6, 0.9], [[1., 1., 0., 1.], [0., 0., 1., 0.]] + + x.clear_grad() result3 = paddle.max(x, axis=-1) - print(result3) - #[0.9 0.7] + result3.backward() + print(result3, x.grad) + #[0.9, 0.7], [[0., 0., 0., 1.], [0., 0., 0., 1.]] + + x.clear_grad() result4 = paddle.max(x, axis=1, keepdim=True) - print(result4) - #[[0.9] - # [0.7]] + result4.backward() + print(result4, x.grad) + #[[0.9], [0.7]], [[0., 0., 0., 1.], [0., 0., 0., 1.]] # data_y is a Tensor with shape [2, 2, 2] # the axis is list - y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]]]) + [[5.0, 6.0], [7.0, 8.0]]], + dtype='float64', stop_gradient=False) result5 = paddle.max(y, axis=[1, 2]) - print(result5) - #[4. 8.] + result5.backward() + print(result5, y.grad) + #[4., 8.], [[[0., 0.], [0., 1.]], [[0., 0.], [0., 1.]]] + + y.clear_grad() result6 = paddle.max(y, axis=[0, 1]) - print(result6) - #[7. 8.] + result6.backward() + print(result6, y.grad) + #[7., 8.], [[[0., 0.], [0., 0.]], [[0., 0.], [1., 1.]]] """ - if axis is not None and not isinstance(axis, list): - if isinstance(axis, tuple): - axis = list(axis) - elif isinstance(axis, int): - axis= [axis] - else: - raise TypeError( - "The type of axis must be int, list or tuple, but received {}".format(type(axis))) - - reduce_all = True if axis == None or axis == [] else False - axis = axis if axis != None and axis != [] else [0] + reduce_all, axis = _get_reduce_all_value(axis) if in_dygraph_mode(): return _C_ops.reduce_max(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) @@ -1406,6 +1702,11 @@ def min(x, axis=None, keepdim=False, name=None): Computes the minimum of tensor elements over the given axis + Note: + The difference between min and amin is: If there are multiple minimum elements, + amin evenly distributes gradient between these equal values, + while min propagates gradient to all of them. + Args: x(Tensor): A tensor, the data type is float32, float64, int32, int64. axis(int|list|tuple, optional): The axis along which the minimum is computed. 
@@ -1429,46 +1730,52 @@ def min(x, axis=None, keepdim=False, name=None): import paddle - # x is a tensor with shape [2, 4] + # data_x is a Tensor with shape [2, 4] # the axis is a int element x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]]) + [0.1, 0.2, 0.6, 0.7]], + dtype='float64', stop_gradient=False) result1 = paddle.min(x) - print(result1) - #[0.1] + result1.backward() + print(result1, x.grad) + #[0.1], [[0., 0., 0., 0.], [1., 0., 0., 0.]] + + x.clear_grad() result2 = paddle.min(x, axis=0) - print(result2) - #[0.1 0.2 0.5 0.7] + result2.backward() + print(result2, x.grad) + #[0.1, 0.2, 0.5, 0.7], [[0., 0., 1., 0.], [1., 1., 0., 1.]] + + x.clear_grad() result3 = paddle.min(x, axis=-1) - print(result3) - #[0.2 0.1] + result3.backward() + print(result3, x.grad) + #[0.2, 0.1], [[1., 0., 0., 0.], [1., 0., 0., 0.]] + + x.clear_grad() result4 = paddle.min(x, axis=1, keepdim=True) - print(result4) - #[[0.2] - # [0.1]] + result4.backward() + print(result4, x.grad) + #[[0.2], [0.1]], [[1., 0., 0., 0.], [1., 0., 0., 0.]] - # y is a Tensor with shape [2, 2, 2] + # data_y is a Tensor with shape [2, 2, 2] # the axis is list y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]]]) + [[5.0, 6.0], [7.0, 8.0]]], + dtype='float64', stop_gradient=False) result5 = paddle.min(y, axis=[1, 2]) - print(result5) - #[1. 5.] + result5.backward() + print(result5, y.grad) + #[1., 5.], [[[1., 0.], [0., 0.]], [[1., 0.], [0., 0.]]] + + y.clear_grad() result6 = paddle.min(y, axis=[0, 1]) - print(result6) - #[1. 2.] + result6.backward() + print(result6, y.grad) + #[1., 2.], [[[1., 1.], [0., 0.]], [[0., 0.], [0., 0.]]] """ - if axis is not None and not isinstance(axis, list): - if isinstance(axis, tuple): - axis = list(axis) - elif isinstance(axis, int): - axis= [axis] - else: - raise TypeError( - "The type of axis must be int, list or tuple, but received {}".format(type(axis))) - reduce_all = True if axis == None or axis == [] else False - axis = axis if axis != None and axis != [] else [0] + reduce_all, axis = _get_reduce_all_value(axis) if in_dygraph_mode(): return _C_ops.reduce_min(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) @@ -1490,6 +1797,230 @@ def min(x, axis=None, keepdim=False, name=None): }) return out +def amax(x, axis=None, keepdim=False, name=None): + """ + Computes the maximum of tensor elements over the given axis. + + Note: + The difference between max and amax is: If there are multiple maximum elements, + amax evenly distributes gradient between these equal values, + while max propagates gradient to all of them. + + Args: + x(Tensor): A tensor, the data type is float32, float64, int32, int64, + the dimension is no more than 4. + axis(int|list|tuple, optional): The axis along which the maximum is computed. + If :attr:`None`, compute the maximum over all elements of + `x` and return a Tensor with a single element, + otherwise must be in the range :math:`[-x.ndim(x), x.ndim(x))`. + If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`. + keepdim(bool, optional): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the `x` unless :attr:`keepdim` is true, default + value is False. + name(str, optional): The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor, results of maximum on the specified axis of input tensor, + it's data type is the same as `x`. 
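# [Editor's note] The Note above ("amax evenly distributes gradient ... while max propagates
# gradient to all of them") can be stated as a formula. A small NumPy sketch of the two
# gradient rules for a 1-D input (illustrative only, not part of the diff):
import numpy as np

x = np.array([0.1, 0.9, 0.9, 0.9])
is_max = (x == x.max())
grad_max = is_max.astype(float)            # max : every maximal entry receives 1
grad_amax = grad_max / is_max.sum()        # amax: the single 1 is split evenly
print(grad_max)    # [0. 1. 1. 1.]
print(grad_amax)   # [0.         0.33333333 0.33333333 0.33333333]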
+ + Examples: + .. code-block:: python + + import paddle + # data_x is a Tensor with shape [2, 4] with multiple maximum elements + # the axis is a int element + + x = paddle.to_tensor([[0.1, 0.9, 0.9, 0.9], + [0.9, 0.9, 0.6, 0.7]], + dtype='float64', stop_gradient=False) + # There are 5 maximum elements: + # 1) amax evenly distributes gradient between these equal values, + # thus the corresponding gradients are 1/5=0.2; + # 2) while max propagates gradient to all of them, + # thus the corresponding gradient are 1. + result1 = paddle.amax(x) + result1.backward() + print(result1, x.grad) + #[0.9], [[0., 0.2, 0.2, 0.2], [0.2, 0.2, 0., 0.]] + + x.clear_grad() + result1_max = paddle.max(x) + result1_max.backward() + print(result1_max, x.grad) + #[0.9], [[0., 1.0, 1.0, 1.0], [1.0, 1.0, 0., 0.]] + + ############################### + + x.clear_grad() + result2 = paddle.amax(x, axis=0) + result2.backward() + print(result2, x.grad) + #[0.9, 0.9, 0.9, 0.9], [[0., 0.5, 1., 1.], [1., 0.5, 0., 0.]] + + x.clear_grad() + result3 = paddle.amax(x, axis=-1) + result3.backward() + print(result3, x.grad) + #[0.9, 0.9], [[0., 0.3333, 0.3333, 0.3333], [0.5, 0.5, 0., 0.]] + + x.clear_grad() + result4 = paddle.amax(x, axis=1, keepdim=True) + result4.backward() + print(result4, x.grad) + #[[0.9], [0.9]], [[0., 0.3333, 0.3333, 0.3333.], [0.5, 0.5, 0., 0.]] + + # data_y is a Tensor with shape [2, 2, 2] + # the axis is list + y = paddle.to_tensor([[[0.1, 0.9], [0.9, 0.9]], + [[0.9, 0.9], [0.6, 0.7]]], + dtype='float64', stop_gradient=False) + result5 = paddle.amax(y, axis=[1, 2]) + result5.backward() + print(result5, y.grad) + #[0.9., 0.9], [[[0., 0.3333], [0.3333, 0.3333]], [[0.5, 0.5], [0., 1.]]] + + y.clear_grad() + result6 = paddle.amax(y, axis=[0, 1]) + result6.backward() + print(result6, y.grad) + #[0.9., 0.9], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] + """ + + reduce_all, axis = _get_reduce_all_value(axis) + if in_dygraph_mode(): + return _C_ops.reduce_amax(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) + + helper = LayerHelper('amax', **locals()) + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'amax') + + out = helper.create_variable_for_type_inference( + dtype=x.dtype) + helper.append_op( + type='reduce_amax', + inputs={'X': x}, + outputs={'Out': out}, + attrs={ + 'dim': axis, + 'keep_dim': keepdim, + 'reduce_all': reduce_all + }) + return out + +def amin(x, axis=None, keepdim=False, name=None): + """ + + Computes the minimum of tensor elements over the given axis + + Note: + The difference between min and amin is: If there are multiple minimum elements, + amin evenly distributes gradient between these equal values, + while min propagates gradient to all of them. + + Args: + x(Tensor): A tensor, the data type is float32, float64, int32, int64, + the dimension is no more than 4. + axis(int|list|tuple, optional): The axis along which the minimum is computed. + If :attr:`None`, compute the minimum over all elements of + `x` and return a Tensor with a single element, + otherwise must be in the range :math:`[-x.ndim, x.ndim)`. + If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`. + keepdim(bool, optional): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the `x` unless :attr:`keepdim` is true, default + value is False. + name(str, optional): The default value is None. Normally there is no need for + user to set this property. 
For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor, results of minimum on the specified axis of input tensor, + it's data type is the same as input's Tensor. + + Examples: + .. code-block:: python + + import paddle + # data_x is a Tensor with shape [2, 4] with multiple minimum elements + # the axis is a int element + + x = paddle.to_tensor([[0.2, 0.1, 0.1, 0.1], + [0.1, 0.1, 0.6, 0.7]], + dtype='float64', stop_gradient=False) + # There are 5 minimum elements: + # 1) amin evenly distributes gradient between these equal values, + # thus the corresponding gradients are 1/5=0.2; + # 2) while min propagates gradient to all of them, + # thus the corresponding gradient are 1. + result1 = paddle.amin(x) + result1.backward() + print(result1, x.grad) + #[0.1], [[0., 0.2, 0.2, 0.2], [0.2, 0.2, 0., 0.]] + + x.clear_grad() + result1_min = paddle.min(x) + result1_min.backward() + print(result1_min, x.grad) + #[0.1], [[0., 1.0, 1.0, 1.0], [1.0, 1.0, 0., 0.]] + + ############################### + + x.clear_grad() + result2 = paddle.amin(x, axis=0) + result2.backward() + print(result2, x.grad) + #[0.1, 0.1, 0.1, 0.1], [[0., 0.5, 1., 1.], [1., 0.5, 0., 0.]] + + x.clear_grad() + result3 = paddle.amin(x, axis=-1) + result3.backward() + print(result3, x.grad) + #[0.1, 0.1], [[0., 0.3333, 0.3333, 0.3333], [0.5, 0.5, 0., 0.]] + + x.clear_grad() + result4 = paddle.amin(x, axis=1, keepdim=True) + result4.backward() + print(result4, x.grad) + #[[0.1], [0.1]], [[0., 0.3333, 0.3333, 0.3333.], [0.5, 0.5, 0., 0.]] + + # data_y is a Tensor with shape [2, 2, 2] + # the axis is list + y = paddle.to_tensor([[[0.2, 0.1], [0.1, 0.1]], + [[0.1, 0.1], [0.6, 0.7]]], + dtype='float64', stop_gradient=False) + result5 = paddle.amin(y, axis=[1, 2]) + result5.backward() + print(result5, y.grad) + #[0.1., 0.1], [[[0., 0.3333], [0.3333, 0.3333]], [[0.5, 0.5], [0., 1.]]] + + y.clear_grad() + result6 = paddle.amin(y, axis=[0, 1]) + result6.backward() + print(result6, y.grad) + #[0.1., 0.1], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] + """ + + reduce_all, axis = _get_reduce_all_value(axis) + if in_dygraph_mode(): + return _C_ops.reduce_amin(x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all) + + helper = LayerHelper('amin', **locals()) + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'amin') + + out = helper.create_variable_for_type_inference( + dtype=x.dtype) + helper.append_op( + type='reduce_amin', + inputs={'X': x}, + outputs={'Out': out}, + attrs={ + 'dim': axis, + 'keep_dim': keepdim, + 'reduce_all': reduce_all + }) + return out def log1p(x, name=None): r""" @@ -2860,6 +3391,51 @@ def lerp_(x, y, weight, name=None): raise ValueError("The shape of broadcast output {} is different from that of inplace tensor {} in the Inplace operation.".format(out_shape, x.shape)) return _C_ops.lerp_(x, y, weight) +def erfinv(x, name=None): + r""" + The inverse error function of x, . + + Equation: + .. math:: + + erfinv(erf(x)) = x. + + Args: + x (Tensor): An N-D Tensor, the data type is float32, float64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the shape and data type is the same with input. + + Example: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([0, 0.5, -1.], dtype="float32") + out = paddle.erfinv(x) + # out: [0, 0.4769, -inf] + + """ + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'erfinv') + + if in_dygraph_mode(): + return _C_ops.erfinv(x) + + helper = LayerHelper('erfinv', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='erfinv', inputs={'X': x}, outputs={'Out': out}) + return out + +@inplace_apis_in_dygraph_only +def erfinv_(x, name=None): + r""" + Inplace version of ``erfinv`` API, the output Tensor will be inplaced with input ``x``. + Please refer to :ref:`api_tensor_erfinv`. + """ + check_type(x, 'x', (paddle.Tensor, Variable), 'erfinv') + return _C_ops.erfinv_(x) + def rad2deg(x, name=None): """ Convert each of the elements of input x from angles in radians to degrees. diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 67e5120a54b79..5adb937118303 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -79,6 +79,49 @@ def bernoulli(x, name=None): return out +def poisson(x, name=None): + """ + This OP returns a tensor filled with random number from a Poisson Distribution. + + .. math:: + + out_i \sim Poisson (lambda = x_i) + + Args: + x(Tensor): A tensor with rate parameter of poisson Distribution. The data type + should be float32, float64. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name`. + Returns: + Tensor: A Tensor filled with random number with the same shape and dtype as ``x``. + + Examples: + .. code-block:: python + + import paddle + paddle.set_device('cpu') + paddle.seed(100) + + x = paddle.uniform([2,3], min=1.0, max=5.0) + out = paddle.poisson(x) + #[[2., 5., 0.], + # [5., 1., 3.]] + + """ + + if in_dygraph_mode(): + return _C_ops.poisson(x) + + check_variable_and_dtype(x, "x", ["float32", "float64"], "poisson") + + helper = LayerHelper("poisson", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='poisson', inputs={'X': x}, outputs={'Out': out}, attrs={}) + return out + + def multinomial(x, num_samples=1, replacement=False, name=None): """ This OP returns a Tensor filled with random values sampled from a Multinomical @@ -555,8 +598,8 @@ def uniform_(x, min=-1.0, max=1.0, seed=0, name=None): # [-0.34646994, -0.45116323, -0.09902662, -0.11397249], # random # [ 0.433519, 0.39483607, -0.8660099, 0.83664286]] # random """ - return core.ops.uniform_random_inplace_(x, 'min', min, 'max', max, 'seed', - seed) + return _C_ops.uniform_random_inplace_(x, 'min', min, 'max', max, 'seed', + seed) def randint(low=0, high=None, shape=[1], dtype=None, name=None): @@ -937,3 +980,49 @@ def rand(shape, dtype=None, name=None): """ return uniform(shape, dtype, min=0.0, max=1.0, name=name) + + +def exponential_(x, lam=1.0, name=None): + """ + This inplace OP fill input Tensor ``x`` with random number from a Exponential Distribution. + + ``lam`` is :math:`\lambda` parameter of Exponential Distribution. + + .. math:: + + f(x) = \lambda e^{-\lambda x} + + Args: + x(Tensor): Input tensor. The data type should be float32, float64. + lam(float, optional): :math:`\lambda` parameter of Exponential Distribution. Default, 1.0. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. 
For more information, please + refer to :ref:`api_guide_Name`. + Returns: + Tensor: Input Tensor ``x``. + + Examples: + .. code-block:: python + + import paddle + paddle.set_device('cpu') + paddle.seed(100) + + x = paddle.empty([2,3]) + x.exponential_() + # [[0.80643415, 0.23211166, 0.01169797], + # [0.72520673, 0.45208144, 0.30234432]] + + """ + if in_dygraph_mode(): + return _C_ops.exponential_(x, "lambda", lam) + + check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential") + + helper = LayerHelper("exponential", **locals()) + helper.append_op( + type='exponential', + inputs={"X": x}, + outputs={'Out': x}, + attrs={"lambda": lam}) + return x diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index f3587aa48ddcb..0685e276458d3 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -470,6 +470,59 @@ def sort(x, axis=-1, descending=False, name=None): return out +def mode(x, axis=-1, keepdim=False, name=None): + """ + This OP is used to find values and indices of the modes at the optional axis. + + Args: + x(Tensor): Tensor, an input N-D Tensor with type float32, float64, int32, int64. + axis(int, optional): Axis to compute indices along. The effective range + is [-R, R), where R is x.ndim. when axis < 0, it works the same way + as axis + R. Default is -1. + keepdim(bool, optional): Whether to keep the given axis in output. If it is True, the dimensions will be same as input x and with size one in the axis. Otherwise the output dimentions is one fewer than x since the axis is squeezed. Default is False. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + tuple(Tensor), return the values and indices. The value data type is the same as the input `x`. The indices data type is int64. + + Examples: + + .. code-block:: python + + import paddle + + tensor = paddle.to_tensor([[[1,2,2],[2,3,3]],[[0,5,5],[9,9,0]]], dtype=paddle.float32) + res = paddle.mode(tensor, 2) + print(res) + # (Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 3.], + # [5., 9.]]), Tensor(shape=[2, 2], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [[1, 1], + # [1, 0]])) + + """ + if in_dygraph_mode(): + return _C_ops.mode(x, "axis", axis, "keepdim", keepdim) + + helper = LayerHelper("mode", **locals()) + inputs = {"X": [x]} + attrs = {} + attrs['axis'] = axis + attrs['keepdim'] = keepdim + + values = helper.create_variable_for_type_inference(dtype=x.dtype) + indices = helper.create_variable_for_type_inference(dtype="int64") + + helper.append_op( + type="mode", + inputs=inputs, + outputs={"Out": [values], + "Indices": [indices]}, + attrs=attrs) + indices.stop_gradient = True + return values, indices + + def where(condition, x, y, name=None): r""" Return a tensor of elements selected from either $x$ or $y$, depending on $condition$. @@ -838,3 +891,65 @@ def searchsorted(sorted_sequence, "right": right}) return out + + +def kthvalue(x, k, axis=None, keepdim=False, name=None): + """ + This OP is used to find values and indices of the k-th smallest at the axis. + + Args: + x(Tensor): A N-D Tensor with type float32, float64, int32, int64. + k(int): The k for the k-th smallest number to look for along the axis. + axis(int, optional): Axis to compute indices along. The effective range + is [-R, R), where R is x.ndim. when axis < 0, it works the same way + as axis + R. The default is None. 
And if the axis is None, it will computed as -1 by default. + keepdim(bool, optional): Whether to keep the given axis in output. If it is True, the dimensions will be same as input x and with size one in the axis. Otherwise the output dimentions is one fewer than x since the axis is squeezed. Default is False. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + tuple(Tensor), return the values and indices. The value data type is the same as the input `x`. The indices data type is int64. + + Examples: + + .. code-block:: python + + import paddle + + x = paddle.randn((2,3,2)) + # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[ 0.22954939, -0.01296274], + # [ 1.17135799, -0.34493217], + # [-0.19550551, -0.17573971]], + # + # [[ 0.15104349, -0.93965352], + # [ 0.14745511, 0.98209465], + # [ 0.10732264, -0.55859774]]]) + y = paddle.kthvalue(x, 2, 1) + # (Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[ 0.22954939, -0.17573971], + # [ 0.14745511, -0.55859774]]), Tensor(shape=[2, 2], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [[0, 2], + # [1, 2]])) + """ + if in_dygraph_mode(): + if axis is not None: + return _C_ops.kthvalue(x, 'k', k, "axis", axis, "keepdim", keepdim) + else: + return _C_ops.kthvalue(x, 'k', k, "keepdim", keepdim) + + helper = LayerHelper("kthvalue", **locals()) + inputs = {"X": [x]} + attrs = {'k': k} + if axis is not None: + attrs['axis'] = axis + values = helper.create_variable_for_type_inference(dtype=x.dtype) + indices = helper.create_variable_for_type_inference(dtype="int64") + + helper.append_op( + type="kthvalue", + inputs=inputs, + outputs={"Out": [values], + "Indices": [indices]}, + attrs=attrs) + indices.stop_gradient = True + return values, indices diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 6a016e42b5ab8..45a663b016840 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -333,3 +333,127 @@ def median(x, axis=None, keepdim=False, name=None): newshape = out_tensor.shape out_tensor = out_tensor.reshape(newshape, name=name) return out_tensor + + +def quantile(x, q, axis=None, keepdim=False): + """ + Compute the quantile of the input along the specified axis. + + Args: + x (Tensor): The input Tensor, it's data type can be float32, float64. + q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list, + each q will be calculated and the first dimension of output is same to the number of ``q`` . + axis (int|list, optional): The axis along which to calculate quantile. ``axis`` should be int or list of int. + ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . + If ``axis`` is less than 0, it works the same way as :math:`axis + D`. + If ``axis`` is a list, quantile is calculated over all elements of given axises. + If ``axis`` is None, quantile is calculated over all elements of ``x``. Default is None. + keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keepdim`` is True, the dimensions of + the output Tensor is the same as ``x`` except in the reduced + dimensions(it is of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. 
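# [Editor's note] The implementation further down computes each quantile by sorting along the
# axis and linearly interpolating between the two neighbouring order statistics at the
# fractional index q * (n - 1). A plain NumPy sketch of that rule for the 1-D case
# (illustrative only, not part of the diff):
import numpy as np

def quantile_1d(values, q):
    s = np.sort(values)
    idx = q * (len(s) - 1)                    # fractional position in the sorted data
    lo, hi = int(np.floor(idx)), int(np.ceil(idx))
    w = idx - lo                              # interpolation weight, as used with lerp below
    return s[lo] * (1 - w) + s[hi] * w

print(quantile_1d([3., 1., 2., 4.], 0.5))     # 2.5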
+ + Returns: + Tensor, results of quantile along ``axis`` of ``x``. If data type of ``x`` is float64, data type of results will be float64, otherwise data type will be float32. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.randn((2,3)) + #[[-1.28740597, 0.49533170, -1.00698614], + # [-1.11656201, -1.01010525, -2.23457789]]) + + y1 = paddle.quantile(x, q=0.5, axis=[0, 1]) + # y1 = -1.06333363 + + y2 = paddle.quantile(x, q=0.5, axis=1) + # y2 = [-1.00698614, -1.11656201] + + y3 = paddle.quantile(x, q=[0.3, 0.5], axis=1) + # y3 =[[-1.11915410, -1.56376839], + # [-1.00698614, -1.11656201]] + + y4 = paddle.quantile(x, q=0.8, axis=1, keepdim=True) + # y4 = [[-0.10559537], + # [-1.05268800]]) + """ + if not isinstance(x, Variable): + raise TypeError("input x should be a Tensor.") + dims = len(x.shape) + out_shape = x.shape + if axis is None: + x = paddle.flatten(x) + axis = 0 + out_shape = [1] * dims + else: + if isinstance(axis, list): + if (len(axis) <= 0): + raise ValueError("axis should not be empty") + axis_src, axis_dst = [], [] + for axis_single in axis: + if not isinstance(axis_single, int) or not ( + axis_single < dims and axis_single >= -dims): + raise ValueError( + "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." + ) + if axis_single < 0: + axis_single = axis_single + dims + axis_src.append(axis_single) + out_shape[axis_single] = 1 + axis_dst = list(range(-len(axis), 0)) + x = paddle.moveaxis(x, axis_src, axis_dst) + x = paddle.flatten(x, axis_dst[0], axis_dst[-1]) + axis = axis_dst[0] + else: + if not isinstance(axis, int) or not (axis < dims and axis >= -dims): + raise ValueError( + "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." + ) + if axis < 0: + axis += dims + out_shape[axis] = 1 + indices = [] + if isinstance(q, (int, float)): + if q < 0 or q > 1: + raise ValueError("q should be in range [0, 1]") + indices.append(q * (x.shape[axis] - 1)) + elif isinstance(q, (list, tuple)): + if len(q) <= 0: + raise ValueError("q should not be empty") + for q_num in q: + if q_num < 0 or q_num > 1: + raise ValueError("q should be in range [0, 1]") + indices.append(q_num * (x.shape[axis] - 1)) + else: + raise TypeError("Type of q should be int, float, list or tuple.") + indices = paddle.to_tensor(indices).astype(paddle.float32) + sorted_tensor = paddle.sort(x, axis) + indices_below = paddle.floor(indices).astype(paddle.int32) + indices_upper = paddle.ceil(indices).astype(paddle.int32) + outputs = [] + + # TODO(chenjianye): replace the for-loop to directly take elements. 
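# [Editor's note] The loop below gathers the two neighbouring order statistics with
# paddle.take_along_axis (added earlier in this patch) and lerps between them. NumPy has the
# same gather primitive, which makes the semantics easy to see (illustrative only):
import numpy as np

sorted_rows = np.array([[1., 2., 3.], [4., 5., 6.]])
idx = np.array([[1], [1]])                             # one index per 1-d slice along axis 1
print(np.take_along_axis(sorted_rows, idx, axis=1))    # picks arr[0, 1] and arr[1, 1] -> 2. and 5.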
+ for i in range(len(indices)): + if (indices_upper[i] != indices_below[i]): + tensor_below = paddle.take_along_axis(sorted_tensor, + indices_below[i], axis) + tensor_upper = paddle.take_along_axis(sorted_tensor, + indices_upper[i], axis) + weights = (indices[i] - indices_below[i]).astype(x.dtype) + out = paddle.lerp(tensor_below, tensor_upper, weights) + else: + out = paddle.take_along_axis(sorted_tensor, indices_below[i], axis) + if not keepdim: + out = paddle.squeeze(out, axis=axis) + else: + out = out.reshape(out_shape) + outputs.append(out) + if isinstance(q, (list, tuple)): + return paddle.stack(outputs, 0) + else: + return outputs[0] diff --git a/python/paddle/text/viterbi_decode.py b/python/paddle/text/viterbi_decode.py index 3eec29f26ada7..8f75addc52f83 100644 --- a/python/paddle/text/viterbi_decode.py +++ b/python/paddle/text/viterbi_decode.py @@ -16,6 +16,7 @@ from ..fluid.framework import core, in_dygraph_mode from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type +from paddle import _C_ops __all__ = ['viterbi_decode', 'ViterbiDecoder'] @@ -58,9 +59,8 @@ def viterbi_decode(potentials, scores, path = paddle.text.viterbi_decode(emission, transition, length, False) # scores: [3.37089300, 1.56825531], path: [[1, 0, 0], [1, 1, 0]] """ if in_dygraph_mode(): - return core.ops.viterbi_decode(potentials, transition_params, lengths, - 'include_bos_eos_tag', - include_bos_eos_tag) + return _C_ops.viterbi_decode(potentials, transition_params, lengths, + 'include_bos_eos_tag', include_bos_eos_tag) check_variable_and_dtype(potentials, 'input', ['float32', 'float64'], 'viterbi_decode') check_variable_and_dtype(transition_params, 'transitions', diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 0c410d9b66fe9..562a726aa29f2 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -15,9 +15,17 @@ func : CastInferMeta kernel : func : cast - param : [x, out_dtype, x.dtype()] + param : [x, out_dtype] data_type : x +- api : conj + args : (const Tensor& x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : conj + - api : divide args : (const Tensor& x, const Tensor& y) output : Tensor @@ -36,6 +44,32 @@ kernel : func : dot +- api : empty + args : (const ScalarArray& shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) + output: Tensor + infer_meta : + func : CreateInferMeta + param : [shape, dtype, layout] + kernel : + func : empty + param : [shape] + data_type : dtype + backend : place + layout : layout + +- api : empty_like + args : (const Tensor& x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) + output: Tensor + infer_meta : + func : CreateLikeInferMeta + param : [x, dtype, layout] + kernel : + func : empty_like + param : [] + data_type : dtype > x + backend : place > x + layout : layout > x + - api : flatten args : (const Tensor& x, int start_axis, int stop_axis) output : Tensor @@ -48,7 +82,7 @@ args : (const ScalarArray& shape, const Scalar& value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) output: Tensor infer_meta : - func : FullInferMeta + func : CreateInferMeta param : [shape, dtype, layout] kernel : func : full @@ -61,7 +95,7 @@ args : (const Tensor& x, const Scalar& value, DataType dtype = DataType::UNDEFINED, Backend place = 
Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) output: Tensor infer_meta : - func : FullLikeInferMeta + func : CreateLikeInferMeta param : [x, dtype, layout] kernel : func : full_like @@ -145,20 +179,3 @@ args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) output : Tensor invoke : full_like(x, 0, dtype, place, layout) - -# - api : full_like -# args : (const Tensor& x, const Scalar& value, DataType dtype, Backend place)->Tensor -# output: {Tensor : dtype} -# kernel : fill_any_like -# T : [dtype, x] -# backend : [place, x] -# layout : [] -# InferMeta : UnchangedInferMeta(x) - -- api : conj - args : (const Tensor& x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : conj diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 029985475011e..35720ae32fe38 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -43,12 +43,11 @@ def __init__(self, api_item_yaml): if 'data_type' not in self.kernel or len(self.kernel[ 'data_type']) == 0: self.kernel['data_type'] = None - if 'param' not in self.kernel or len(self.kernel['param']) == 0: + if 'param' not in self.kernel: self.kernel['param'] = None self.infer_meta = api_item_yaml['infer_meta'] - if 'param' not in self.infer_meta or len(self.infer_meta[ - 'param']) == 0: + if 'param' not in self.infer_meta: self.infer_meta['param'] = None def parse_args(self, args_str): @@ -343,12 +342,12 @@ def source_include(header_file_path): #include "paddle/pten/api/include/kernel_signature.h" #include "paddle/pten/api/lib/api_registry.h" -#include "paddle/pten/api/lib/kernel_declare.h" #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/include/core.h" #include "paddle/pten/include/infermeta.h" +#include "paddle/pten/kernels/declarations.h" """ diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 3113ffea786a0..f085eac1e358d 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -471,16 +471,17 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): Normalize include_dirs, library_dir and other attributes in kwargs. """ assert isinstance(kwargs, dict) - include_dirs = [] + compile_include_dirs = [] # NOTE: the "_compile_dir" argument is not public to users. It is only # reserved for internal usage. We do not guarantee that this argument # is always valid in the future release versions. 
compile_dir = kwargs.get("_compile_dir", None) + if compile_dir: - include_dirs = _get_include_dirs_when_compiling(compile_dir) + compile_include_dirs = _get_include_dirs_when_compiling(compile_dir) + # append necessary include dir path of paddle - include_dirs = kwargs.get('include_dirs', include_dirs) + include_dirs = list(kwargs.get('include_dirs', [])) + include_dirs.extend(compile_include_dirs) include_dirs.extend(find_paddle_includes(use_cuda)) kwargs['include_dirs'] = include_dirs diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py index efdc6847f0056..9feda3d2dae6a 100644 --- a/python/paddle/utils/install_check.py +++ b/python/paddle/utils/install_check.py @@ -89,16 +89,35 @@ def _is_npu_available(): return False -def _run_dygraph_single(use_cuda, use_npu): +def _is_xpu_available(): """ - Testing the simple network in dygraph mode using one CPU/GPU. + Check whether XPU is available. + """ + try: + assert len(paddle.static.xpu_places()) > 0 + return True + except Exception as e: + logging.warning( + "You are using XPU version PaddlePaddle, but there is no XPU " + "detected on your machine. Maybe XPU devices are not set properly." + "\n Original Error is {}".format(e)) + return False + + +def _run_dygraph_single(use_cuda, use_xpu, use_npu): + """ + Testing the simple network in dygraph mode using one CPU/GPU/XPU/NPU. Args: use_cuda (bool): Whether running with CUDA. + use_xpu (bool): Whether running with XPU. + use_npu (bool): Whether running with NPU. """ paddle.disable_static() if use_cuda: paddle.set_device('gpu') + elif use_xpu: + paddle.set_device('xpu') elif use_npu: paddle.set_device('npu') else: @@ -119,12 +138,14 @@ def _run_dygraph_single(use_cuda, use_npu): opt.step() -def _run_static_single(use_cuda, use_npu): +def _run_static_single(use_cuda, use_xpu, use_npu): """ - Testing the simple network with executor running directly, using one CPU/GPU. + Testing the simple network with executor running directly, using one CPU/GPU/XPU/NPU. Args: use_cuda (bool): Whether running with CUDA. + use_xpu (bool): Whether running with XPU. + use_npu (bool): Whether running with NPU. """ paddle.enable_static() with paddle.static.scope_guard(paddle.static.Scope()): @@ -138,6 +159,8 @@ def _run_static_single(use_cuda, use_npu): if use_cuda: place = paddle.CUDAPlace(0) + elif use_xpu: + place = paddle.XPUPlace(0) elif use_npu: place = paddle.NPUPlace(0) else: @@ -151,12 +174,14 @@ def _run_static_single(use_cuda, use_npu): paddle.disable_static() -def _run_static_parallel(use_cuda, use_npu, device_list): +def _run_static_parallel(use_cuda, use_xpu, use_npu, device_list): """ Testing the simple network in data parallel mode, using multiple CPU/GPU. Args: use_cuda (bool): Whether running with CUDA. + use_xpu (bool): Whether running with XPU. + use_npu (bool): Whether running with NPU. device_list (int): The specified devices. """ paddle.enable_static() @@ -175,6 +200,9 @@ def _run_static_parallel(use_cuda, use_npu, device_list): if use_cuda: place = paddle.CUDAPlace(0) + elif use_xpu: + place = paddle.XPUPlace(0) + compiled_prog = train_prog elif use_npu: place = paddle.NPUPlace(0) compiled_prog = train_prog @@ -210,19 +238,23 @@ def run_check(): print("Running verify PaddlePaddle program ... 
") + use_cuda = False + use_xpu = False + use_npu = False + if paddle.is_compiled_with_cuda(): use_cuda = _is_cuda_available() - use_npu = False + elif paddle.is_compiled_with_xpu(): + use_xpu = _is_xpu_available() elif paddle.is_compiled_with_npu(): use_npu = _is_npu_available() - use_cuda = False - else: - use_npu = False - use_cuda = False if use_cuda: device_str = "GPU" device_list = paddle.static.cuda_places() + elif use_xpu: + device_str = "XPU" + device_list = paddle.static.xpu_places() elif use_npu: device_str = "NPU" device_list = paddle.static.npu_places() @@ -231,12 +263,12 @@ def run_check(): device_list = paddle.static.cpu_places(device_count=2) device_count = len(device_list) - _run_static_single(use_cuda, use_npu) - _run_dygraph_single(use_cuda, use_npu) + _run_static_single(use_cuda, use_xpu, use_npu) + _run_dygraph_single(use_cuda, use_xpu, use_npu) print("PaddlePaddle works well on 1 {}.".format(device_str)) try: - _run_static_parallel(use_cuda, use_npu, device_list) + _run_static_parallel(use_cuda, use_xpu, use_npu, device_list) print("PaddlePaddle works well on {} {}s.".format(device_count, device_str)) print( diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 965cf8b55e793..68cd3ae72a6aa 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -953,10 +953,10 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): "Input features with shape should be (N, C, H, W)") output_channels = int(x.shape[1] / (pooled_height * pooled_width)) if in_dygraph_mode(): - return core.ops.psroi_pool(x, boxes, boxes_num, "output_channels", - output_channels, "spatial_scale", - spatial_scale, "pooled_height", - pooled_height, "pooled_width", pooled_width) + return _C_ops.psroi_pool(x, boxes, boxes_num, "output_channels", + output_channels, "spatial_scale", + spatial_scale, "pooled_height", pooled_height, + "pooled_width", pooled_width) helper = LayerHelper('psroi_pool', **locals()) dtype = helper.input_dtype() @@ -1064,7 +1064,7 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): pooled_height, pooled_width = output_size if in_dygraph_mode(): assert boxes_num is not None, "boxes_num should not be None in dygraph mode." - pool_out, argmaxes = core.ops.roi_pool( + pool_out, argmaxes = _C_ops.roi_pool( x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width", pooled_width, "spatial_scale", spatial_scale) return pool_out @@ -1219,7 +1219,7 @@ def roi_align(x, pooled_height, pooled_width = output_size if in_dygraph_mode(): assert boxes_num is not None, "boxes_num should not be None in dygraph mode." 
- align_out = core.ops.roi_align( + align_out = _C_ops.roi_align( x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width", pooled_width, "spatial_scale", spatial_scale, "sampling_ratio", sampling_ratio, "aligned", aligned) diff --git a/python/requirements.txt b/python/requirements.txt index 50d28bd94fcb7..f2a4580a94e51 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,5 +1,5 @@ requests>=2.20.0 -numpy>=1.20 ; python_version>="3.5" +numpy>=1.13 protobuf>=3.1.0 Pillow six diff --git a/python/setup.py.in b/python/setup.py.in index 5da5623bd455c..af0d59f6a3cd4 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -276,6 +276,7 @@ packages=['paddle', 'paddle.incubate.tensor', 'paddle.incubate.nn', 'paddle.incubate.passes', + 'paddle.distribution', 'paddle.distributed.fleet', 'paddle.distributed.fleet.base', 'paddle.distributed.fleet.elastic', @@ -307,7 +308,6 @@ packages=['paddle', 'paddle.fluid.dygraph', 'paddle.fluid.dygraph.dygraph_to_static', 'paddle.fluid.dygraph.amp', - 'paddle.fluid.eager', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.distributed', diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 55e7d199cd15b..fe8382faa0c34 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -10,3 +10,4 @@ paddle2onnx>=0.8.2 scipy>=1.6 prettytable distro +numpy>=1.20 diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 0b705f88ea424..92f806b7e8a84 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -107,11 +107,11 @@ for API_FILE in ${API_FILES[*]}; do # You can use http://caius.github.io/github_id/ to find Github user id. # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, dingjiaweiww 23093488, juncaipeng 52520497, zhangting2020 26615455, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676, Dong Daxiang 35550832, phlrain 43953930, qili93 16605440. 
if [ "${API_FILE}" == "CMakeLists.txt" ];then - echo_line="You must have one RD (wanghuancoder, luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n" - check_approval 1 6836917 46782768 26922892 + echo_line="You must have one RD (wanghuancoder, luotao1, XiaoguangHu01 or qili93) approval for CMakeLists.txt, which manages the compilation parameter.\n" + check_approval 1 6836917 46782768 26922892 16605440 elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then - echo_line="You must have one RD (lanxianghit (Recommend), phlrain or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n" - check_approval 1 6836917 47554610 43953930 + echo_line="You must have one RD (lanxianghit (Recommend), phlrain, luotao1 or qili93) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n" + check_approval 1 6836917 47554610 43953930 16605440 elif [ "${API_FILE}" == "python/requirements.txt" ];then echo_line="You must have one RD (phlrain) and one TPM (dingjiaweiww) and one QA (kolinwei) approval for python/requirements.txt, which manages the third-party python package.\n" check_approval 3 43953930 23093488 22165420 @@ -188,8 +188,8 @@ done FILTER=`git diff --name-only upstream/develop | grep -v "tools/"` HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH $FILTER | grep '^\+' | grep -o -m 1 "const_cast" || true` if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1) approval for the usage of const_cast.\n" - check_approval 1 46782768 12538138 6836917 22561442 6888866 + echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1,qili93) approval for the usage of const_cast.\n" + check_approval 1 46782768 12538138 6836917 22561442 6888866 16605440 fi HAS_BOOST_GET=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "boost::get" || true` @@ -213,8 +213,8 @@ fi NO_NPU_FILE=`git diff --name-only upstream/$BRANCH | grep -v "_npu.py"` HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH ${NO_NPU_FILE} | grep "^+[[:space:]]\{0,\}@unittest.skip" || true` if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" - check_approval 1 22165420 6836917 46661762 26922892 + echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder, luotao1 or qili93) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" + check_approval 1 22165420 6836917 46661762 26922892 16605440 fi HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true` @@ -229,6 +229,18 @@ if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 2 6888866 39303645 fi +HAS_MODIFIED_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/tensor" || true` +if [ "${HAS_MODIFIED_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You must be approved by jim19930609 or chenwhql for paddle/fluid/framework/tensor. It is being modularized and refactored. 
Thanks!\n" + check_approval 1 22561442 22334008 + fi + +HAS_MODIFIED_LOD_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/lod_tensor" || true` +if [ "${HAS_MODIFIED_LOD_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You must be approved by jim19930609 or chenwhql for paddle/fluid/framework/lod_tensor. It is being modularized and refactored. Thanks!\n" + check_approval 1 22561442 22334008 + fi + ALLOCSHARED_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "*\.(h|cc)" || true` if [ "${ALLOCSHARED_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then ERROR_LINES="" @@ -314,8 +326,8 @@ if [ "${OP_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then done if [ "${ERROR_LINES}" != "" ]; then ERROR_LINES=${ERROR_LINES//+/'\n+\t'} - echo_line="Using ShareDataWith or ShareBufferWith is not recommended. You must have one RD's (zhhsplendid (Recommend), zhiqiu or luotao1 or lanxianghit) approval to use these methods. For more information, please refer to https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-is-prohibited-in-OP. The error lines are as follows:${ERROR_LINES}" - check_approval 1 6836917 6888866 47554610 7913861 + echo_line="Using ShareDataWith or ShareBufferWith is not recommended. You must have one RD's (zhhsplendid (Recommend), zhiqiu or luotao1 or lanxianghit or qili93) approval to use these methods. For more information, please refer to https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-is-prohibited-in-OP. The error lines are as follows:${ERROR_LINES}" + check_approval 1 6836917 6888866 47554610 7913861 16605440 fi fi @@ -345,8 +357,8 @@ if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then done if [ "${ERROR_LINES}" != "" ]; then ERROR_LINES=${ERROR_LINES//+/'\n+\t'} - echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (zhangting2020 (Recommend), luotao1 or phlrain) approval for the usage (either add or delete) of @skip_check_grad_ci. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Gradient-Check-Is-Required-for-Op-Test. The corresponding lines are as follows:\n${ERROR_LINES}\n" - check_approval 1 26615455 6836917 43953930 + echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (zhangting2020 (Recommend), luotao1, phlrain or qili93) approval for the usage (either add or delete) of @skip_check_grad_ci. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Gradient-Check-Is-Required-for-Op-Test. The corresponding lines are as follows:\n${ERROR_LINES}\n" + check_approval 1 26615455 6836917 43953930 16605440 fi fi diff --git a/tools/count_api_without_core_ops.py b/tools/count_api_without_core_ops.py index a2093e34fbacb..5519859471ac9 100644 --- a/tools/count_api_without_core_ops.py +++ b/tools/count_api_without_core_ops.py @@ -22,6 +22,7 @@ import hashlib import functools import platform +from paddle import _C_ops __all__ = ['get_apis_with_and_without_core_ops', ] @@ -207,7 +208,7 @@ def get_api_source_desc(modules): else: print("""Usage: - 1. Count and list all operator-raleated APIs that contains append_op but not core.ops.xx. + 1. Count and list all operator-related APIs that contain append_op but not _C_ops.xx. python ./count_api_without_core_ops.py -c paddle 2. Print api and the md5 of source code of the api. 
python ./count_api_without_core_ops.py -p paddle diff --git a/tools/dockerfile/Dockerfile.paddle-npu-build b/tools/dockerfile/Dockerfile.paddle-npu-build new file mode 100644 index 0000000000000..62361880cc6eb --- /dev/null +++ b/tools/dockerfile/Dockerfile.paddle-npu-build @@ -0,0 +1,5 @@ +FROM registry.baidubce.com/paddlepaddle/paddle-npu:latest-dev-cann5.0.2.alpha005-gcc82-x86_64-with-driver +RUN apt-get install pigz -y +RUN apt-get remove -y openjdk* +CMD ["/bin/bash"] +EXPOSE 22 diff --git a/tools/dockerfile/build_scripts/install_gcc.sh b/tools/dockerfile/build_scripts/install_gcc.sh index a95bc99a6084a..0edd09a99ecb4 100644 --- a/tools/dockerfile/build_scripts/install_gcc.sh +++ b/tools/dockerfile/build_scripts/install_gcc.sh @@ -30,7 +30,7 @@ else fi if [ "$1" == "gcc82" ]; then - wget -q https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz + wget -q --no-proxy https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz tar -xvf gcc-8.2.0.tar.xz && \ cd gcc-8.2.0 && \ unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \ @@ -44,7 +44,7 @@ if [ "$1" == "gcc82" ]; then ln -s /usr/local/gcc-8.2/lib64/libstdc++.so.6 ${lib_so_6} && \ cp /usr/local/gcc-8.2/lib64/libstdc++.so.6.0.25 ${lib_path} elif [ "$1" == "gcc54" ]; then - wget -q https://paddle-ci.gz.bcebos.com/gcc-5.4.0.tar.gz + wget -q --no-proxy https://paddle-ci.gz.bcebos.com/gcc-5.4.0.tar.gz tar -xzf gcc-5.4.0.tar.gz && \ cd gcc-5.4.0 && \ unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \ diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index bb8f9deabe7eb..fd814b990161d 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -79,7 +79,7 @@ function make_cinn_dockerfile(){ function make_ce_framework_dockcerfile(){ dockerfile_name="Dockerfile.cuda11.2_cudnn8_gcc82_trt8" - sed "s//11.2.0-cudnn8-devel-ubuntu18.04/g" ./Dockerfile.ubuntu18 >${dockerfile_name} + sed "s//11.2.0-cudnn8-devel-ubuntu16.04/g" ./Dockerfile.ubuntu >${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "7i RUN chmod 777 /tmp" ${dockerfile_name} sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ @@ -100,9 +100,11 @@ function make_ce_framework_dockcerfile(){ RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \\ ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} sed -i 's#RUN bash /build_scripts/install_trt.sh#RUN bash /build_scripts/install_trt.sh trt8034#g' ${dockerfile_name} + sed -i 's#28/af/2c76c8aa46ccdf7578b83d97a11a2d1858794d4be4a1610ade0d30182e8b/pip-20.0.1.tar.gz#b7/2d/ad02de84a4c9fd3b1958dc9fb72764de1aa2605a9d7e943837be6ad82337/pip-21.0.1.tar.gz#g' ${dockerfile_name} + sed -i 's#pip-20.0.1#pip-21.0.1#g' ${dockerfile_name} + sed -i 's#python setup.py install#python3.7 setup.py install#g' ${dockerfile_name} } - function main() { make_ubuntu_dockerfile make_centos_dockerfile diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 55d1dcf005b50..4df27bfe4e923 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -1088,6 +1088,7 @@ 'test_fill_any_op', 'test_frame_op', 'test_linalg_pinv_op', + 'test_linalg_lstsq_op', 'test_gumbel_softmax_op', 'test_matrix_power_op', 'test_multi_dot_op', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 8a8e5d8910bae..694283264ca8f 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ 
-727,5 +727,7 @@ 'test_class_center_sample_op', 'test_fill_diagonal_tensor_op', 'test_fill_any_op', + 'test_lu_op', 'test_margin_cross_entropy_op', + 'test_pull_gpups_sparse_op', ]
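Note on the kthvalue addition in python/paddle/tensor/search.py: the documented semantics are that k is 1-based and the k-th smallest value along the axis is returned together with its index. The snippet below is only a rough NumPy cross-check of that behaviour; the helper name and sample data are illustrative and not part of the patch.

    # Rough NumPy cross-check of the documented kthvalue semantics
    # (k is 1-based; the k-th smallest element along `axis` is returned).
    import numpy as np

    def kthvalue_reference(arr, k, axis=-1):
        order = np.argsort(arr, axis=axis)
        picked = np.take(order, k - 1, axis=axis)  # positions of the k-th smallest
        values = np.take_along_axis(arr, np.expand_dims(picked, axis), axis).squeeze(axis)
        return values, picked

    vals, idx = kthvalue_reference(np.array([[4.0, 1.0, 3.0], [2.0, 5.0, 0.0]]), 2, axis=1)
    print(vals, idx)  # [3. 2.] [2 0]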
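Note on the quantile addition in python/paddle/tensor/stat.py: the kernel sorts along the axis, computes the target position as q * (n - 1), and linearly interpolates between the floor and ceil neighbours via paddle.lerp. A minimal NumPy sketch of the 1-D case follows; the function name and sample values are illustrative only.

    # Minimal NumPy reference for the 1-D scheme used in the patch:
    # index = q * (n - 1), then linear interpolation between floor/ceil neighbours.
    import numpy as np

    def quantile_1d_reference(values, q):
        """Return the q-th quantile of a 1-D sequence via linear interpolation."""
        data = np.sort(np.asarray(values, dtype=np.float64))
        idx = q * (data.size - 1)
        lo, hi = int(np.floor(idx)), int(np.ceil(idx))
        weight = idx - lo  # fractional part drives the lerp
        return data[lo] * (1.0 - weight) + data[hi] * weight

    print(quantile_1d_reference([3.0, 1.0, 2.0, 4.0], 0.5))  # 2.5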
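Note on the normalize_extension_kwargs change in python/paddle/utils/cpp_extension/extension_utils.py: user-supplied include_dirs no longer shadow the internal _compile_dir entries; the three sources are merged in order. A small sketch of the resulting merge order, with illustrative stand-in directory names.

    # Sketch of the include-dir merge order after the extension_utils change:
    # user dirs first, then the internal _compile_dir entries, then Paddle's own
    # include dirs (the find_paddle_includes(use_cuda) result).
    def merge_include_dirs(user_dirs, compile_dirs, paddle_dirs):
        merged = list(user_dirs)      # copy so the caller's list is not mutated
        merged.extend(compile_dirs)   # internal _compile_dir entries
        merged.extend(paddle_dirs)    # Paddle framework headers
        return merged

    print(merge_include_dirs(["/opt/my_ext/include"],
                             ["/tmp/compile_cache/include"],
                             ["/usr/local/paddle/include"]))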