diff --git a/CMakeLists.txt b/CMakeLists.txt index 51c0ef35f1efa..433081ee2256b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,8 +256,8 @@ option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) if(WITH_RECORD_BUILDTIME) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") else() include(ccache) # set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache endif() diff --git a/README.md b/README.md index 21e0aba8b48bf..048a273a7d78b 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm ## Installation -### Latest PaddlePaddle Release: [v2.2](https://github.com/PaddlePaddle/Paddle/tree/release/2.2) +### Latest PaddlePaddle Release: [v2.3](https://github.com/PaddlePaddle/Paddle/tree/release/2.3) Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle. diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index 5029878af6199..d02f47142e775 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -90,9 +90,10 @@ endif() if (WITH_ASCEND_CL) macro(find_ascend_toolkit_version ascend_toolkit_version_info) file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS) - string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") - string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") + string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") + string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION}) + STRING(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION) add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}") if(NOT ASCEND_TOOLKIT_VERSION) set(ASCEND_TOOLKIT_VERSION "???") diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 29625b2b52e18..8f955008fa079 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -19,7 +19,7 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 9a35435c18722ff17a48fb60bceac42bfdf78754) +SET(MKLDNN_TAG 9b186765dded79066e0cd9c17eb70b680b76fb8e) # Introduce variables: diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index d5ccf1297922f..1c4a424995887 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220425") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220511") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -17,7 +17,7 @@ endif() # ubuntu and centos: use output by XDNN API team if(NOT DEFINED XPU_XDNN_BASE_URL) SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220425") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220511") else() SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 1df9e1497384b..3eb833b6b3d35 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -261,6 +261,12 @@ function(op_library TARGET) elseif (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() + # deal with CANN version control while registering NPU operators before build + if (WITH_ASCEND_CL) + if (CANN_VERSION LESS 504000) + list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc") + endif() + endif() # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) # Combine the cc source files. diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake index 166f8786337b1..adf3d74c26220 100644 --- a/cmake/xpu_kp.cmake +++ b/cmake/xpu_kp.cmake @@ -16,6 +16,10 @@ if(NOT WITH_XPU_KP) return() endif() +set(LINK_FLAGS "-Wl,--allow-multiple-definition") +set(CMAKE_EXE_LINKER_FLAGS "${LINK_FLAGS}") +set(CMAKE_SHARED_LINKER_FLAGS "${LINK_FLAGS}") + if(NOT XPU_TOOLCHAIN) set(XPU_TOOLCHAIN /workspace/output/XTDK-ubuntu_x86_64) get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index a7c3e2208ab74..96009ce722905 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -901,6 +901,9 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, dev_ctx->Wait(); + Tensor src_value_tensor(std::make_shared(src->value())); + std::vector dst_shape = src_value_tensor.shape(); + if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + size_, [&](int64_t row) { return row == cpu_rows_num_ptr[0]; })) { // During sparse communication, the number of each card is same. 
@@ -940,8 +943,6 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, &dst_rows_vector); dev_ctx->Wait(); - Tensor src_value_tensor(std::make_shared(src->value())); - std::vector dst_shape = src_value_tensor.shape(); dst_shape[dst_shape.size() - 2] = rows_num; auto dst_dense_tensor = std::dynamic_pointer_cast( paddle::experimental::full(IntArray(dst_shape), 0, @@ -971,8 +972,58 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, *(src->mutable_value()) = *(std::dynamic_pointer_cast(dst_value_tensor.impl())); } else { - PADDLE_THROW( - platform::errors::Unimplemented("This case is not supported.")); + std::vector rows_tensors; + std::vector values_tensors; + + for (int i = 0; i < size_; ++i) { + std::vector value_tensor_shape = { + cpu_rows_num_ptr[i], dst_shape[dst_shape.size() - 1]}; + Tensor rows_tensor = paddle::experimental::full( + IntArray({static_cast(cpu_rows_num_ptr[i])}), 0, + DataType::INT64, inner_place_); + Tensor values_tensor = paddle::experimental::full( + IntArray(value_tensor_shape), 0, src->value().dtype(), inner_place_); + std::vector rows_dense_vector; + std::vector values_dense_vector; + + if (i == rank_) { + auto *rows_dense_tensor = + std::dynamic_pointer_cast(rows_tensor.impl()) + .get(); + framework::TensorFromVector(src_rows, *dev_ctx, + rows_dense_tensor); + values_tensor.set_impl( + std::make_shared(src->value())); + } + rows_dense_vector.push_back( + *std::dynamic_pointer_cast(rows_tensor.impl())); + values_dense_vector.push_back( + *std::dynamic_pointer_cast(values_tensor.impl())); + + auto b_opts = BroadcastOptions(); + b_opts.source_rank = i; + process_group_->Broadcast(rows_dense_vector, rows_dense_vector, b_opts); + process_group_ + ->Broadcast(values_dense_vector, values_dense_vector, b_opts) + ->Wait(); + rows_tensors.push_back(rows_tensor); + values_tensors.push_back(values_tensor); + } + + Tensor dst_rows_tensor = + paddle::experimental::concat(rows_tensors, phi::Scalar(0)); + framework::Vector dst_rows_vector(rows_num, 0); + auto *dst_rows_dense_tensor = + std::dynamic_pointer_cast(dst_rows_tensor.impl()) + .get(); + framework::TensorToVector(*dst_rows_dense_tensor, *dev_ctx, + &dst_rows_vector); + src->set_rows(dst_rows_vector); + + Tensor dst_values_tensor = + paddle::experimental::concat(values_tensors, phi::Scalar(0)); + *(src->mutable_value()) = *( + std::dynamic_pointer_cast(dst_values_tensor.impl())); } } diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h index 36bafc943701f..efaa48470a8bd 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -171,19 +171,16 @@ class HeterClient { // switch client singleton static std::shared_ptr GetSwitchInstance( const std::vector& peer_endpoints, int32_t peer_role) { + std::unique_lock lock(mtx_); + if (peer_endpoints.empty()) { + VLOG(4) << "init switch client failed, null peer_endpoints"; + } + VLOG(4) << "peer role is: " << peer_role + << ", addr is: " << peer_endpoints[0]; if (switch_s_instance_ == nullptr) { - std::unique_lock lock(mtx_); - if (peer_endpoints.empty()) { - VLOG(4) << "init switch client failed, null peer_endpoints"; - } - VLOG(4) << "peer role is: " << peer_role - << ", addr is: " << peer_endpoints[0]; - if (switch_s_instance_ == nullptr) { - switch_s_instance_.reset(new HeterClient()); - switch_s_instance_->SetPeerSwitchList(peer_endpoints); - switch_s_instance_->InitClientChannels(false, peer_endpoints, - peer_role); - } + 
switch_s_instance_.reset(new HeterClient()); + switch_s_instance_->SetPeerSwitchList(peer_endpoints); + switch_s_instance_->InitClientChannels(false, peer_endpoints, peer_role); } return switch_s_instance_; } diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc index 0753a6799c1be..fd38a030ff366 100755 --- a/paddle/fluid/distributed/ps/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -125,6 +125,9 @@ int SendAndRecvVariableHandler::SaveInSwitchWithShard( brpc::Controller* cntl) { VLOG(4) << "entering SaveInSwitchWithShard"; int32_t group_id = request->group_id(); + if (group_id >= FLAGS_heter_world_size) { + LOG(ERROR) << "group id exceeds maximum"; + } auto& local_shard = _local_shards[group_id]; auto& request_io_buffer = cntl->request_attachment(); butil::IOBufBytesIterator io_buffer_itr(request_io_buffer); @@ -132,11 +135,11 @@ int SendAndRecvVariableHandler::SaveInSwitchWithShard( const auto& var_name = request->send_var_names(idx); const auto& var_size = request->vars_len(idx); WaitForVarsConsumed(group_id, var_name); + std::unique_lock<std::mutex> lk(scope_mutex_); auto& value = local_shard[var_name]; value.resize(var_size); io_buffer_itr.copy_and_forward(reinterpret_cast(value.data()), var_size); - std::unique_lock<std::mutex> lk(scope_mutex_); vars_ready_flag[group_id][var_name] = 1; VLOG(4) << "saved var_name: " << var_name << "is saved ready!"; } @@ -162,11 +165,11 @@ int SendAndRecvVariableHandler::QueryInSwitchWithShard( VLOG(4) << "req var name: " << req_var_name; response->add_send_var_names(req_var_name); WaitForVarsProduced(group_id, req_var_name); + std::unique_lock<std::mutex> lk(scope_mutex_); auto itr = local_shard.find(req_var_name); auto& value = itr.value(); response_io_buffer.append(value.data(), value.size()); value.resize(0); // clear memory - std::unique_lock<std::mutex> lk(scope_mutex_); vars_ready_flag[group_id][req_var_name] = 0; VLOG(4) << "query var_name: " << req_var_name << "is consumed ready!"; } diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 0d3d23be4e8d1..926bb7e7c9fd3 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -109,7 +109,7 @@ class PSClient { size_t table_id) = 0; // reserved // firstly push dense param for parameter server - // this is neccessary because dense weight initialized in trainer on cold + // this is necessary because dense weight initialized in trainer on cold // start virtual std::future<int32_t> PushDenseParam(const Region *regions, size_t region_num, diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index a3fa80b3865e4..b53044b7493e0 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -80,7 +80,7 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( } for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); paddle::framework::GpuPsCommGraph res; - unsigned int tot_len = 0; + int64_t tot_len = 0; for (int i = 0; i < task_pool_size_; i++) { tot_len += edge_array[i].size(); } @@ -88,8 +88,8 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( // res.node_size = ids.size(); // res.neighbor_list = new int64_t[tot_len]; // res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()]; - res.init_on_cpu(tot_len, (unsigned int)ids.size()); - unsigned int offset = 0, ind = 0; +
res.init_on_cpu(tot_len, ids.size()); + int64_t offset = 0, ind = 0; for (int i = 0; i < task_pool_size_; i++) { for (int j = 0; j < (int)node_array[i].size(); j++) { res.node_list[ind] = node_array[i][j]; @@ -126,8 +126,8 @@ int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, _db->put(src_id % shard_num % task_pool_size_, ch, sizeof(int) * 2 + sizeof(int64_t), (char *)data, len); } - _db->flush(src_id % shard_num % task_pool_size_); - std::string x; + // _db->flush(src_id % shard_num % task_pool_size_); + // std::string x; // if (_db->get(src_id % shard_num % task_pool_size_, ch, sizeof(int64_t) + // 2 * sizeof(int), x) ==0){ // VLOG(0)<<"put result"; @@ -135,6 +135,18 @@ int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, // VLOG(0)<<"get an id "<<*((int64_t *)(x.c_str() + i)); // } //} + // if(src_id == 429){ + // str = ""; + // _db->get(src_id % shard_num % task_pool_size_, ch, + // sizeof(int) * 2 + sizeof(int64_t), str); + // int64_t *stored_data = ((int64_t *)str.c_str()); + // int n = str.size() / sizeof(int64_t); + // VLOG(0)<<"429 has "< edge_array[task_pool_size_]; std::vector> count(task_pool_size_); @@ -387,9 +403,9 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { [&, i, this]() -> int64_t { int64_t cost = 0; std::vector &v = shards[i]->get_bucket(); - std::vector s; size_t ind = i % this->task_pool_size_; for (size_t j = 0; j < v.size(); j++) { + std::vector s; for (int k = 0; k < v[j]->get_neighbor_size(); k++) { s.push_back(v[j]->get_neighbor_id(k)); } @@ -405,7 +421,7 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { } int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { VLOG(0) << "make_complementary_graph"; - const int64_t fixed_size = 10000; + const int64_t fixed_size = byte_size / 8; // std::vector edge_array[task_pool_size_]; std::vector> count(task_pool_size_); std::vector> tasks; @@ -416,7 +432,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { std::vector &v = shards[i]->get_bucket(); size_t ind = i % this->task_pool_size_; for (size_t j = 0; j < v.size(); j++) { - size_t location = v[j]->get_id(); + // size_t location = v[j]->get_id(); for (int k = 0; k < v[j]->get_neighbor_size(); k++) { count[ind][v[j]->get_neighbor_id(k)]++; } @@ -424,19 +440,12 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { return 0; })); } - + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); std::unordered_map final_count; std::map> count_to_id; std::vector buffer; - for (auto p : edge_shards[idx]) { - delete p; - } + clear_graph(idx); - edge_shards[idx].clear(); - for (size_t i = 0; i < shard_num_per_server; i++) { - edge_shards[idx].push_back(new GraphShard()); - } - for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); for (int i = 0; i < task_pool_size_; i++) { for (auto &p : count[i]) { final_count[p.first] = final_count[p.first] + p.second; @@ -447,13 +456,13 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { count_to_id[p.second].push_back(p.first); VLOG(2) << p.first << " appear " << p.second << " times"; } - // std::map>::iterator iter= count_to_id.rbegin(); auto iter = count_to_id.rbegin(); while (iter != count_to_id.rend() && byte_size > 0) { for (auto x : iter->second) { buffer.push_back(x); if (buffer.size() >= fixed_size) { int64_t res = load_graph_to_memory_from_ssd(idx, buffer); + buffer.clear(); byte_size -= res; } if (byte_size <= 0) break; @@ -1265,13 +1274,14 @@ int32_t 
GraphTable::random_sample_neighbors( if (node == nullptr) { #ifdef PADDLE_WITH_HETERPS if (search_level == 2) { - VLOG(2) << "enter sample from ssd"; + VLOG(2) << "enter sample from ssd for node_id " << node_id; char *buffer_addr = random_sample_neighbor_from_ssd( idx, node_id, sample_size, rng, actual_size); if (actual_size != 0) { - std::shared_ptr<char> &buffer = buffers[idx]; + std::shared_ptr<char> &buffer = buffers[idy]; buffer.reset(buffer_addr, char_del); } + VLOG(2) << "actual sampled size from ssd = " << actual_sizes[idy]; continue; } #endif diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index b0d5add49565f..ec6f0e26a08fa 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -19,21 +19,25 @@ #include "paddle/fluid/distributed/store/tcp_store.h" #include "paddle/fluid/distributed/store/tcp_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/flags.h" namespace paddle { namespace distributed { namespace detail { -constexpr int INFTIME = -1; +constexpr int INFTIME = 10000; // 10 seconds -std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket, - int nranks) { - return std::make_unique<MasterDaemon>(socket, nranks); +std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket, int nranks, + int stop_check_timeout) { + return std::make_unique<MasterDaemon>(socket, nranks, stop_check_timeout); } -MasterDaemon::MasterDaemon(SocketType socket, int nranks) - : _listen_socket(socket), _nranks(nranks) { +MasterDaemon::MasterDaemon(SocketType socket, int nranks, + int stop_check_timeout) + : _listen_socket(socket), + _nranks(nranks), + _stop_check_timeout(stop_check_timeout) { _background_thread = std::thread{&MasterDaemon::run, this}; } @@ -86,6 +90,10 @@ void MasterDaemon::_do_get(SocketType socket) { void MasterDaemon::_do_stop(SocketType socket) { VLOG(3) << "MasterDaemon::_do_stop"; + if (!_has_stop) { + _stop_time = std::chrono::system_clock::now(); + } + _has_stop = true; ReplyType value = ReplyType::STOP_WAIT; tcputils::send_value<ReplyType>(socket, value); if (--_nranks == 0) { @@ -115,6 +123,20 @@ void MasterDaemon::run() { #endif while (!_stop) { + auto end_time = std::chrono::system_clock::now(); + if (_has_stop) { + std::chrono::duration<double> diff = end_time - _stop_time; + int elapsed_seconds = static_cast<int>(diff.count()); + PADDLE_ENFORCE_LT( + elapsed_seconds, _stop_check_timeout, + platform::errors::Fatal( + "%d seconds elapsed after the first worker " + "stopped, so we think there may be something wrong and will " + "stop the master worker. You can use " + "'export FLAGS_stop_check_timeout=3600'" + " to change the timeout value in seconds.
The default one is 900", + elapsed_seconds)); + } for (size_t i = 0; i < fds.size(); i++) { fds[i].revents = 0; } @@ -173,10 +195,12 @@ void MasterDaemon::run() { } } -std::unique_ptr TCPServer::create(uint16_t port, int nranks) { +std::unique_ptr TCPServer::create(uint16_t port, int nranks, + int stop_check_timeout) { int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); auto server = std::make_unique(); - server->_master_daemon = MasterDaemon::start(socket, nranks); + server->_master_daemon = + MasterDaemon::start(socket, nranks, stop_check_timeout); return server; } @@ -219,10 +243,11 @@ std::vector TCPClient::receive_vector() { } // namespace detail TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, - size_t num_workers, std::chrono::seconds timeout) + size_t num_workers, std::chrono::seconds timeout, + int stop_check_timeout) : Store(timeout), _is_master(is_master), _num_workers(num_workers) { if (_is_master) { - _server = detail::TCPServer::create(port, num_workers); + _server = detail::TCPServer::create(port, num_workers, stop_check_timeout); } _client = detail::TCPClient::connect(host, port); diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h index 17c1d8ea30a42..4ca9a673bf575 100644 --- a/paddle/fluid/distributed/store/tcp_store.h +++ b/paddle/fluid/distributed/store/tcp_store.h @@ -34,9 +34,11 @@ namespace detail { class MasterDaemon { public: static std::unique_ptr start(SocketType listen_socket, - int nranks); + int nranks, + int stop_check_timeout); MasterDaemon() = delete; - explicit MasterDaemon(SocketType listen_socket, int nranks); + explicit MasterDaemon(SocketType listen_socket, int nranks, + int stop_check_timeout); ~MasterDaemon(); private: @@ -51,13 +53,17 @@ class MasterDaemon { std::unordered_map> _store; std::thread _background_thread{}; int _nranks; - bool _stop = false; + int _stop_check_timeout; + bool _stop = false; // all workers stopped + std::chrono::time_point _stop_time; + bool _has_stop = false; // at least one worker stopped }; class TCPServer { public: TCPServer() = default; - static std::unique_ptr create(std::uint16_t port, int nranks); + static std::unique_ptr create(std::uint16_t port, int nranks, + int stop_check_timeout); private: std::unique_ptr _master_daemon; @@ -93,7 +99,8 @@ class TCPStore : public Store { static constexpr std::uint16_t kDefaultPort = 6170; explicit TCPStore(std::string host, uint16_t port = kDefaultPort, bool is_master = false, size_t num_workers = 1, - std::chrono::seconds timeout = tcputils::kDefaultTimeout); + std::chrono::seconds timeout = tcputils::kDefaultTimeout, + int stop_check_timeout = 900); ~TCPStore(); diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 53ac895bfbccb..11c98e5da9dde 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,4 @@ -set(eager_deps phi_api phi_dygraph_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node custom_operator_node) +set(eager_deps phi_api phi_dygraph_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta eager_nan_inf_utils grad_node_info grad_tensor_holder accumulation_node custom_operator_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) @@ -18,6 +18,7 @@ 
if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune) endif() +cc_library(eager_nan_inf_utils SRCS nan_inf_utils.cc DEPS phi_tensor nan_inf_utils enforce) cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 857f1be1f7ae0..2ed44ce489934 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -34,22 +34,9 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, *tensor = t; } else { // Accumulation - PADDLE_ENFORCE_EQ(t.initialized(), true, - paddle::platform::errors::Fatal( - "We can only accumulate initialized tensor, but we " - "got tensor: %s is empty please check you network " - "and make sure it creates grads.", - t.name())); - PADDLE_ENFORCE_NOT_NULL( - tensor, paddle::platform::errors::Fatal( - "We can only accumulate initialized tensor to non-nullptr " - "tensor but we got nullptr please check you network " - "and make sure it creates grads.")); - - if (t.is_dense_tensor()) { - if (tensor->is_dense_tensor()) { + if (LIKELY(t.is_dense_tensor())) { + if (LIKELY(tensor->is_dense_tensor())) { paddle::imperative::TensorAdd(t, tensor); - } else { // TODO(jiabin): Support Other TensorBase later // TODO(zhanlve): Replace SelectedRowsAddTensor with diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 44e78c3bbf193..3c18efea20349 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -17,11 +17,12 @@ #include #include +#include "paddle/fluid/eager/type_defs.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/utils/small_vector.h" + namespace egr { -constexpr size_t kSlotSmallVectorSize = 15U; class UniqueNameGenerator { public: explicit UniqueNameGenerator(std::string prefix = "") : prefix_(prefix) {} @@ -77,7 +78,8 @@ class Controller { op_meta_info_map_.insert(map.begin(), map.end()); } - std::unordered_map>>& + std::unordered_map>>>& GetCustomEdgesSlotMap() { return custom_edges_slot_map_; } @@ -89,8 +91,10 @@ class Controller { new paddle::imperative::Tracer()}; std::unordered_map> op_meta_info_map_; - /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/ - std::unordered_map>> + /* op_type : {{{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}, + * {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}}*/ + std::unordered_map>>> custom_edges_slot_map_; DISABLE_COPY_AND_ASSIGN(Controller); }; diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 44fa8461f2fe9..3edd13ccd597f 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1156,28 +1156,20 @@ static std::string GenerateGradNodeCreationContent( for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); - const std::unordered_set& no_need_buffer_ins = - iter.GetNoNeedBufferInputs(); for (auto& kv : grad_ins_fwd_slotname_map) { const std::string& tensor_wrapper_name = kv.second; - std::string full_reserved = "false"; - 
if (fwd_outputs_name_pos_map.find(tensor_wrapper_name) == - fwd_outputs_name_pos_map.end() && - !no_need_buffer_ins.count(tensor_wrapper_name)) { - full_reserved = "true"; - } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s, %s);\n"; + " grad_node->SetTensorWrapper%s(%s);\n"; // Replace output directly with input in inplace op. if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { auto inplace_input_name = inplace_map[tensor_wrapper_name]; grad_node_creation_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), - LegalizeVarName(inplace_input_name), full_reserved); + LegalizeVarName(inplace_input_name)); } else { grad_node_creation_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), - LegalizeVarName(tensor_wrapper_name), full_reserved); + LegalizeVarName(tensor_wrapper_name)); } } } @@ -2592,7 +2584,6 @@ static std::string GenerateGradNodeHeaderContents( std::string tensor_wrapper_arg_str; std::string tensor_wrapper_body_str; - std::string full_reserved_str = "full_reserved"; std::string no_need_buffer_str = "false"; if (no_need_buffer_ins.count(tensor_wrapper_name)) { no_need_buffer_str = "true"; @@ -2610,12 +2601,12 @@ static std::string GenerateGradNodeHeaderContents( const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = "for(const auto& eager_tensor : %s) {\n" - " %s.emplace_back( egr::TensorWrapper(eager_tensor, %s " - "/*full_reserved*/, %s) );\n" + " %s.emplace_back( egr::TensorWrapper(eager_tensor " + ", %s) );\n" " }\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, - struct_tensor_wrapper_name, full_reserved_str, no_need_buffer_str); + struct_tensor_wrapper_name, no_need_buffer_str); const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = "for (auto tw: %s) {\n" @@ -2636,22 +2627,20 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/, %s);\n"; + "%s = egr::TensorWrapper(%s, %s);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, - tensor_wrapper_name, full_reserved_str, no_need_buffer_str); + tensor_wrapper_name, no_need_buffer_str); const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; clear_tensor_wrappers_str += paddle::string::Sprintf( CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } - std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = - " void SetTensorWrapper%s(%s, %s) {\n %s\n }\n"; + " void SetTensorWrapper%s(%s) {\n %s\n }\n"; set_tensor_wrappers_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - tensor_wrapper_arg_str, full_reserved_signature_str, - tensor_wrapper_body_str); + tensor_wrapper_arg_str, tensor_wrapper_body_str); } } VLOG(6) << "Generated TensorWrapper"; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index ad46ca6cb1c18..50dab6ce840a5 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -16,9 +16,9 @@ add_custom_target(eager_final_state_codegen COMMAND "${PYTHON_EXECUTABLE}" 
"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py" "--api_yaml_path=${api_yaml_path}" "--backward_yaml_path=${backward_yaml_path}" - "--forwards_cc_path=${tmp_forwards_cc_path}" + "--forwards_cc_path=${tmp_forwards_cc_path}" "--forwards_h_path=${tmp_forwards_h_path}" - "--nodes_cc_path=${tmp_nodes_cc_path}" + "--nodes_cc_path=${tmp_nodes_cc_path}" "--nodes_h_path=${tmp_nodes_h_path}" COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_cc_path} ${forwards_cc_path} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_h_path} ${forwards_h_path} diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 9d95b9488d298..84403d58a6044 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -55,8 +55,8 @@ def ParseArguments(): ## Code Gen Templates ## ######################## SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = \ -""" void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{ - {} = egr::TensorWrapper({}, full_reserved, {}); +""" void SetTensorWrapper{}(const paddle::experimental::Tensor& {}) {{ + {} = egr::TensorWrapper({}, {}); }} """ @@ -69,9 +69,9 @@ def ParseArguments(): """ SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = \ -""" void SetTensorWrapper{}(const std::vector& {}, bool full_reserved) {{ +""" void SetTensorWrapper{}(const std::vector& {}) {{ for(const auto& eager_tensor : {}) {{ - {}.emplace_back(egr::TensorWrapper(eager_tensor, full_reserved, {})); + {}.emplace_back(egr::TensorWrapper(eager_tensor, {})); }}; }} """ @@ -146,10 +146,9 @@ class {} : public egr::GradNodeBase {{ {} // Call grad_api function - VLOG(3) << \"Final State Running: \" << \"{}\"; + VLOG(3) << \"Final State Running: {}\"; {} - - // Get Output + // Check NaN and Inf id needed {} // Get GradIn autograd_meta {} @@ -175,6 +174,8 @@ class {} : public egr::GradNodeBase {{ {} // Forward API Call VLOG(3) << \"Final State Running: \" << \"{}\"; +{} + // Check NaN and Inf if needed {} // Get Outputs {} @@ -235,9 +236,11 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/phi/api/include/sparse_api.h" +DECLARE_bool(check_nan_inf); {} """ @@ -262,7 +265,9 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/amp_utils.h" #include "paddle/fluid/eager/eager_amp_auto_cast.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +DECLARE_bool(check_nan_inf); {} {} """ @@ -342,6 +347,10 @@ class {} : public egr::GradNodeBase {{ if( {}.impl() ) {}_optional = paddle::make_optional({}); """ +CHECK_NAN_AND_INF_TEMPLATE = \ +""" if (FLAGS_check_nan_inf) {{ egr::CheckTensorHasNanOrInf("{}", {}); }} +""" + ####################### ## Generator Helpers ## @@ -641,7 +650,7 @@ def GenerateNodeCreationCodes(self): pass_stop_gradient_args_list.append(output_autograd_meta_name) pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list) - # Node Construction + # Node Construction num_backward_inputs = len(forward_outputs_position_map.keys()) num_backward_outputs = len(forward_inputs_position_map.keys()) grad_node_name = 
GetGradNodeName(forward_api_name) @@ -679,9 +688,9 @@ def GenerateNodeCreationCodes(self): if is_fwd_input: if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), true);" + set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" else: - set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, true);" + set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_input_tensor_wrappers_list.append(set_tensor_wrappers) else: if num_fwd_outputs > 1: @@ -691,9 +700,9 @@ def GenerateNodeCreationCodes(self): fwd_output_pos = forward_outputs_position_map[name][1] if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), false);" + set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" else: - set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_output_tensor_wrappers_list.append(set_tensor_wrappers) set_input_tensor_wrappers_str = "\n".join( set_input_tensor_wrappers_list) @@ -701,6 +710,7 @@ def GenerateNodeCreationCodes(self): set_output_tensor_wrappers_list) # SetGradOutMeta & SetEdges + grad_node_out_list = [] set_grad_out_meta_list = [] set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): @@ -713,7 +723,7 @@ def GenerateNodeCreationCodes(self): if not has_corresponding_grad_output: continue - input_autograd_meta_name = GetAutoGradMetaName(name) + grad_node_out_list.append(name) is_optional = (name in self.optional_inputs) if is_optional: set_grad_out_meta = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetGradOutMeta(*({name}.get_ptr()), {pos});" @@ -755,6 +765,7 @@ def GenerateNodeCreationCodes(self): set_input_tensor_wrappers_str, set_grad_out_meta_str, set_out_rank_str, set_history_str, set_grad_in_meta_str, set_retain_grad_str, set_output_tensor_wrappers_str) + self.grad_node_out_list = grad_node_out_list def run(self): # Basic Validation Check @@ -903,10 +914,17 @@ def GenerateForwardDefinition(self, is_inplaced): else: function_name = GetIntermediateAPIFunctionName(function_name) - forward_call_str = f"{indent}auto api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});" + api_out_type = "auto" + if is_inplaced and len(forward_outputs_position_map) == 1: + api_out_type = "auto&" + forward_call_str = f"{indent}{api_out_type} api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});" num_outputs = len(forward_outputs_position_map.keys()) - len( intermediate_outputs) + # Check Nan and Inf + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(function_name, + "api_result") + # Get Outputs get_outputs_str = "" for name, (rtype, pos) in forward_outputs_position_map.items(): @@ -924,11 +942,18 @@ def GenerateForwardDefinition(self, is_inplaced): returns_list[pos] = f"{name}" if IsPlainTensorType(rtype): - returns_type_list[pos] = "paddle::experimental::Tensor" + if is_inplaced and inplace_map and name in inplace_map.values(): + returns_type_list[pos] = "paddle::experimental::Tensor&" + else: + returns_type_list[pos] = "paddle::experimental::Tensor" else: assert IsVectorTensorType(rtype) - returns_type_list[ - pos] = "std::vector" + if is_inplaced and inplace_map and 
name in inplace_map.values(): + returns_type_list[ + pos] = "std::vector&" + else: + returns_type_list[ + pos] = "std::vector" if num_outputs == 1: returns_str = returns_list[0] @@ -937,7 +962,7 @@ def GenerateForwardDefinition(self, is_inplaced): returns_type_str = ", ".join(returns_type_list) returns_type_str = f"std::tuple<{returns_type_str}>" returns_str = ", ".join(returns_list) - returns_str = f"std::make_tuple({returns_str})" + returns_str = f"{returns_type_str}{{{returns_str}}}" # Node Creation Pre-Processing # 1. Get Input AutoGradMeta @@ -1023,10 +1048,10 @@ def GenerateForwardDefinition(self, is_inplaced): self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, dygraph_event_str, amp_logic_str, inputs_autograd_meta_str, - forward_function_name, forward_call_str, get_outputs_str, - outputs_autograd_meta_str, compute_require_grad_args_str, - check_inplace_str, bump_inplace_version_str, node_creation_str, - returns_str) + forward_function_name, forward_call_str, check_nan_inf_str, + get_outputs_str, outputs_autograd_meta_str, + compute_require_grad_args_str, check_inplace_str, + bump_inplace_version_str, node_creation_str, returns_str) self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n" def GenerateInplacedForwardDygraphFunctions(self): @@ -1140,6 +1165,7 @@ def GenerateHigherOrderNodeCreationCode(self): next_grad_api_contents = self.next_grad_api_contents grad_node_creation_str = "" + grad_node_out_list = [] if next_grad_api_contents: forward_api_contents = grad_api_contents forward_api_contents['api'] = forward_api_contents['backward_api'] @@ -1150,10 +1176,11 @@ def GenerateHigherOrderNodeCreationCode(self): next_node_generator.run() next_node_generator.GenerateNodeCreationCodes() grad_node_creation_str = next_node_generator.node_creation_str + grad_node_out_list = next_node_generator.grad_node_out_list self.RecordGrad2NextGradNameMapping(next_node_generator) - return grad_node_creation_str + return grad_node_creation_str, grad_node_out_list def GenerateNodeDeclaration(self): forward_op_name = self.forward_api_name @@ -1214,7 +1241,8 @@ def GenerateNodeDeclaration(self): set_attribute_methods_str, tensor_wrapper_members_str, attribute_members_str) - def GenerateNodeDefinition(self, grad_node_creation_str): + def GenerateNodeDefinition(self, grad_node_creation_str, + grad_node_out_list): namespace = self.namespace forward_api_name = self.forward_api_name backward_api_name = self.backward_api_name @@ -1290,28 +1318,45 @@ def GenerateNodeDefinition(self, grad_node_creation_str): get_grad_in_args_list.append(get_attr_str) get_grad_in_args_str = "\n".join(get_grad_in_args_list) + + # Grad Outputs + for name, (ttype, fwd_position, + grad_api_position) in backward_grad_outputs_map.items(): + transformed_tensor_name = self.TransformToNextGradName(name) + if IsPlainTensorType(ttype): + grad_api_args.append(f"api_output[{fwd_position}][0]") + else: + assert IsVectorTensorType(ttype) + grad_api_args.append(f"api_output[{fwd_position}]") + grad_api_args_str = ", ".join(grad_api_args) # Grad Function Call String + slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) grad_api_namespace = f"paddle::experimental::{namespace}" - grad_function_call_str = f"{indent}auto grad_api_result = {grad_api_namespace}{backward_api_name}({grad_api_args_str});" + grad_function_call_str = f""" + const auto& out_metas = OutputMeta(); + paddle::small_vector, 
egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs}); + paddle::small_vector, egr::kSlotSmallVectorSize> api_output({slot_num_bwd_outputs}); + for (int i = 0; i < {slot_num_bwd_outputs}; ++i) {{ + returns[i].resize(out_metas[i].size()); + if(returns[i].size() == 0) {{ + api_output[i].reserve(1); + api_output[i].push_back(nullptr); + continue; + }} + api_output[i].reserve(returns[i].size()); + for (size_t j = 0; j < returns[i].size(); ++j) {{ + api_output[i].push_back(&returns[i][j]); + }} + }} +""" - # Get Grad Outputs - get_outputs_str = "" - num_outputs = len(backward_grad_outputs_map.keys()) - for name, (ttype, fwd_position, - grad_api_position) in backward_grad_outputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) + grad_function_call_str = grad_function_call_str + f"{indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str});" - if num_outputs == 1: - get_tensor_str = f"{indent}auto& {transformed_tensor_name} = grad_api_result;" - else: - if IsPlainTensorType(ttype): - get_tensor_str = f"{indent}auto& {transformed_tensor_name} = grad_api_result[{grad_api_position}][0];" - else: - assert IsVectorTensorType(ttype) - get_tensor_str = f"{indent}auto& {transformed_tensor_name} = grad_api_result[{grad_api_position}];" - get_outputs_str += get_tensor_str + "\n" + # Check Nan and Inf + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(backward_api_name, + "returns") # Prepare for Node Creation if Necessary inputs_autograd_meta_str = "" @@ -1324,38 +1369,41 @@ def GenerateNodeDefinition(self, grad_node_creation_str): for name, (ttype, pos, grad_api_position) in backward_grad_inputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) - - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + if transformed_tensor_name in grad_node_out_list: + input_autograd_meta_name = GetAutoGradMetaName( transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + if IsPlainTensorType(ttype): + input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + transformed_tensor_name) + input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" + input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append( + input_autograd_meta_name) # 2. 
Get TensorWrapper AutoGradMeta for name, (ttype, _, pos), in backward_forward_inputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) - - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + if transformed_tensor_name in grad_node_out_list: + input_autograd_meta_name = GetAutoGradMetaName( transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + if IsPlainTensorType(ttype): + input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + transformed_tensor_name) + input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" + input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append( + input_autograd_meta_name) - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) compute_require_grad_args_str = ",".join( compute_require_grad_args_list) @@ -1363,28 +1411,26 @@ def GenerateNodeDefinition(self, grad_node_creation_str): # 3. 
Get Output AutoGradMeta outputs_autograd_meta_list = [] num_fwd_outputs = len(backward_grad_outputs_map.keys()) - for name, (rtype, pos, _) in backward_grad_outputs_map.items(): + for name, (rtype, pos, + grad_api_position) in backward_grad_outputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) output_autograd_meta_name = GetAutoGradMetaName( transformed_tensor_name) output_autograd_meta_vec_name = GetAutoGradMetaVectorName( transformed_tensor_name) - if num_fwd_outputs == 1: - if IsPlainTensorType(rtype): - output_autograd_meta = f"{indent}egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});" - else: - assert IsVectorTensorType(rtype) - output_autograd_meta = f"{indent}std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});\n" - output_autograd_meta += f"{indent}std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + if IsPlainTensorType(rtype): + output_autograd_meta = f""" + auto& {transformed_tensor_name} = returns[{pos}][0]; + egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});""" + else: - # Tuple api_result - if IsPlainTensorType(rtype): - output_autograd_meta = f"{indent}egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});" - else: - assert IsVectorTensorType(rtype) - output_autograd_meta = f"{indent}std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});\n" - output_autograd_meta += f"{indent}std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + assert IsVectorTensorType(rtype) + output_autograd_meta = f""" + auto& {transformed_tensor_name} = returns[{pos}]; + std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); + std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name}; +""" outputs_autograd_meta_list.append(output_autograd_meta) outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) @@ -1392,28 +1438,14 @@ def GenerateNodeDefinition(self, grad_node_creation_str): compute_require_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n" compute_require_grad_str += f"{indent}bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({compute_require_grad_args_str});" - # Construct grad_api returns - slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) - returns_str = f"{indent}paddle::small_vector, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs});\n" - for name, (ttype, fwd_position, - grad_api_position) in backward_grad_outputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) - - # Rearrange output order accordingly - if IsPlainTensorType(ttype): - returns_str += f"{indent}returns[{fwd_position}] = {{ {transformed_tensor_name} }};\n" - else: - assert IsVectorTensorType(ttype) - returns_str += f"{indent}returns[{fwd_position}] = {transformed_tensor_name};\n" - - returns_str += f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" + returns_str = f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"{indent}return returns;\n" grad_node_name = GetGradNodeName(forward_api_name) self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format( grad_node_name, fill_zero_str, get_grad_in_args_str, 
grad_node_name, - grad_function_call_str, get_outputs_str, inputs_autograd_meta_str, + grad_function_call_str, check_nan_inf_str, inputs_autograd_meta_str, outputs_autograd_meta_str, compute_require_grad_str, grad_node_creation_str, returns_str) @@ -1426,16 +1458,17 @@ def run(self): ## Code Generation ## ##################### # Higher-order GradNode generation - grad_node_creation_str = self.GenerateHigherOrderNodeCreationCode() + grad_node_creation_str, grad_node_out_list = self.GenerateHigherOrderNodeCreationCode( + ) self.GenerateNodeDeclaration() - self.GenerateNodeDefinition(grad_node_creation_str) + self.GenerateNodeDefinition(grad_node_creation_str, grad_node_out_list) class DygraphYamlGenerator(YamlGeneratorBase): def __init__(self, api_yaml_path, backward_yaml_path): - # Parent members: + # Parent members: # self.namespace # self.api_yaml_path # self.forward_api_list diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index b86685c205a5c..45e4665bd297c 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -100,7 +100,7 @@ def FindParsingFunctionFromAttributeType(atype): // Set Device ID {} - auto out = {}({}); + decltype({}({})) out = {}({}); PyEval_RestoreThread(tstate); tstate = nullptr; @@ -328,7 +328,7 @@ def GeneratePythonCFunction(self): dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) - # Generate Python-C Function Definitions + # Generate Python-C Function Definitions if is_forward_only: fwd_function_name = FUNCTION_NAME_TEMPLATE.format( "paddle::experimental::", namespace, forward_api_name) @@ -344,7 +344,8 @@ def GeneratePythonCFunction(self): self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( forward_api_name, pythonc_record_event_str, forward_api_name, get_eager_tensor_str, parse_attributes_str, set_device_str, - fwd_function_name, dygraph_function_call_str, return_str) + fwd_function_name, dygraph_function_call_str, fwd_function_name, + dygraph_function_call_str, return_str) # Set prefix of forward_api_name to avoid conflicts prefix = self.namespace.strip("::") @@ -380,6 +381,7 @@ def GeneratePythonCFunction(self): inplaced_forward_api_name, get_eager_tensor_str, parse_attributes_str, set_device_str, inplaced_fwd_function_name, dygraph_function_call_str, + inplaced_fwd_function_name, dygraph_function_call_str, return_str) # Generate Python-C Function Registration diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index c5a121067be72..63b899f6d6b62 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -698,8 +698,6 @@ std::vector RunBackward( } } - VLOG(6) << "Running GradNode:" << node->name(); - // Check input EnforceGradNodeHasInput(node); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 2bb86a86e8348..abdd8cadeed4c 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -15,10 +15,151 @@ #include "paddle/fluid/eager/custom_operator/custom_operator_node.h" #include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include 
"paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/core/dense_tensor.h" namespace egr { + +static void ConstructFwdAndBwdMap( + const std::vector& vec_map, + const std::string& op_type) { + auto& in_out_map = egr::Controller::Instance().GetCustomEdgesSlotMap(); + if (in_out_map.find(op_type) != in_out_map.end()) { + if (in_out_map[op_type].size() == 2) { + VLOG(7) << "Find Exist CustomEdgesSlotMap Skip >>>> "; + return; + } + } + + VLOG(7) << "Construct DoubleGrad's CustomEdgesSlotMap "; + auto inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[1]); + auto outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[1]); + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]); + auto grad_outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[2]); + auto grad_inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[2]); + auto grad_attrs_names = + paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[2]); + std::vector> res(5); + in_out_map[op_type].push_back(res); + // Prepare pos map for grad_outputs + VLOG(7) << "Prepare pos map for grad_outputs"; + PADDLE_ENFORCE_LE( + grad_outputs_names.size(), inputs_names.size(), + paddle::platform::errors::InvalidArgument( + "Grad outputs num should be less equal than forward inputs num.")); + for (size_t i = 0; i < grad_outputs_names.size(); i++) { + auto end = grad_outputs_names[i].find("@GRAD@GRAD"); + if (end != std::string::npos) { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_outputs_names[i].substr(0, end + 5) == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " inputs: " << inputs_names[j] + << " related to No." << i + << " grad_outputs: " << grad_outputs_names[i]; + in_out_map[op_type][1][0][j] = i; + } + } + } else { + size_t end_n = grad_outputs_names[i].find("@GRAD@NEW"); + if (end_n != std::string::npos) { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_outputs_names[i].substr(0, end_n) == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " inputs: " << inputs_names[j] + << " related to No." << i + << " grad_outputs: " << grad_outputs_names[i]; + in_out_map[op_type][1][0][j] = i; + } + } + } else { + size_t end_one_grad = grad_outputs_names[i].find("@GRAD"); + if (end_one_grad != std::string::npos) { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_outputs_names[i].substr(0, end_one_grad) == + inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " inputs: " << inputs_names[j] + << " related to No." << i + << " grad_outputs: " << grad_outputs_names[i]; + in_out_map[op_type][1][0][j] = i; + } + } + } else { + PADDLE_THROW(paddle::platform::errors::NotFound( + "All Grad outputs should be end of @GRAD@GRAD or @GRAD@NEW or " + "@GRAD and we got %s is not one of them, " + "please check your op and change to fit the rule.", + grad_outputs_names[i])); + } + } + } + } + // Prepare pos map for grad_inputs + for (size_t i = 0; i < grad_inputs_names.size(); i++) { + size_t end = grad_inputs_names[i].find("@GRAD@GRAD"); + if (end != std::string::npos) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i].substr(0, end + 5) == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " outputs: " << outputs_names[j] + << " related to No." 
<< i + << " grad_inputs's grad: " << grad_inputs_names[i]; + in_out_map[op_type][1][1][j] = i; + } + } + } else { + if (std::find(outputs_names.begin(), outputs_names.end(), + grad_inputs_names[i]) != outputs_names.end()) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i] == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " outputs: " << outputs_names[j] + << " related to No." << i + << " grad_inputs fwd outputs: " << grad_inputs_names[i]; + in_out_map[op_type][1][2][j] = i; + } + } + } else { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_inputs_names[i] == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " inputs: " << inputs_names[j] + << " related to No." << i + << " grad_inputs fwd inputs: " << grad_inputs_names[i]; + in_out_map[op_type][1][3][j] = i; + } + } + } + } + } + + // Prepare pos map for grad attrs_ + for (size_t i = 0; i < grad_attrs_names.size(); i++) { + auto end = + std::find(attrs_names.begin(), attrs_names.end(), grad_attrs_names[i]); + PADDLE_ENFORCE_NE(end, attrs_names.end(), + paddle::platform::errors::NotFound( + "All Grad attrs should be one of forward attrs and " + "we got %s is not one of them, please check your " + "op and change to fit the rule.", + grad_attrs_names[i])); + for (size_t j = 0; j < attrs_names.size(); j++) { + if (grad_attrs_names[i] == attrs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " attrs: " << attrs_names[j] + << " related to No." << i + << " grad_attrs: " << grad_attrs_names[i]; + in_out_map[op_type][1][4][j] = i; + } + } + } +} + paddle::small_vector, kSlotSmallVectorSize> RunCustomOpNode::operator()( @@ -38,10 +179,11 @@ RunCustomOpNode::operator()( tmp_ins(grad_inputs_name.size()); VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); - for (size_t i = 0; i < grads.size(); i++) { - if (map[1].find(i) != map[1].end()) { - VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i]; - tmp_ins[map[1][i]] = grads[i]; + auto hooked_grads = ApplyGradientHooks(grads); + for (size_t i = 0; i < hooked_grads.size(); i++) { + if (map[0][1].find(i) != map[0][1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[0][1][i]; + tmp_ins[map[0][1][i]] = hooked_grads[i]; } } @@ -69,28 +211,218 @@ RunCustomOpNode::operator()( tmp_outs(grad_outputs_names.size()); VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); for (size_t i = 0; i < OutputMeta().size(); i++) { - if (map[0].find(i) != map[0].end()) { + if (map[0][0].find(i) != map[0][0].end()) { VLOG(7) << "Insert grad outputs: " << i << " with size: " << OutputMeta()[i].size() - << " to tmp_outputs: " << map[0][i]; + << " to tmp_outputs: " << map[0][0][i]; for (size_t j = 0; j < OutputMeta()[i].size(); j++) { outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ std::make_shared( phi::DataType::UNDEFINED), egr::Controller::Instance().GenerateUniqueName( "custom_tmp_grad")); + egr::EagerUtils::autograd_meta(&(outs[i][j])); } - tmp_outs[map[0][i]] = outs[i]; + tmp_outs[map[0][0][i]] = outs[i]; } } for (size_t i = 0; i < tmp_outs.size(); i++) { VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); ctx.EmplaceBackOutputs(tmp_outs[i]); } - VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_; + VLOG(7) << "Run Kernel of Grad Custom 
Op: " << op_type_ << "_grad"; (*paddle::framework::OpMetaInfoHelper::GetKernelFn( kernel_map.at(op_type_)[1]))(&ctx); + + VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; + std::vector> ins_auto_grad_metas; + std::vector> outs_auto_grad_metas; + VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); + ins_auto_grad_metas.resize(ctx.InputRange().size()); + VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); + outs_auto_grad_metas.resize(ctx.OutputRange().size()); + + for (size_t i = 0; i < ctx.InputRange().size(); i++) { + ins_auto_grad_metas[i] = + egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( + ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second)); + } + + for (size_t i = 0; i < ctx.OutputRange().size(); i++) { + outs_auto_grad_metas[i] = + egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen( + ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + } + bool require_any_grad = false; + bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + require_any_grad = + require_any_grad || egr::EagerUtils::ComputeRequireGrad( + trace_backward, &(ins_auto_grad_metas[i])); + } + + if (require_any_grad) { + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + const auto& vec_map = meta_info_map.at(op_type_); + paddle::platform::RecordEvent node_creation_record_event( + "Custom Op " + op_type_ + " double_grad node_creation", + paddle::platform::TracerEventType::OperatorInner, 1); + VLOG(6) << " Construct Grad for Custom Op: " << op_type_; + ConstructFwdAndBwdMap(vec_map, op_type_); + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); + } + auto grad_node = std::make_shared( + outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type_); + + auto slot_map = + egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + // Prepare Grad outputs + size_t no_grad_cnt = 0; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + const std::vector& in_tensors = + ctx.InputsBetween(ctx.InputRangeAt(i).first, + ctx.InputRangeAt(i).second); + + if (slot_map[1][0].find(i) != slot_map[1][0].end()) { + grad_node->SetGradOutMeta(in_tensors, slot_map[1][0][i]); + } else { + grad_node->SetGradOutMeta(in_tensors, + ins_auto_grad_metas.size() - 1 - no_grad_cnt); + no_grad_cnt++; + } + } + + // Prepare Grad inputs with grad of fwd outputs + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + const std::vector& out_tensors = + ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first, + ctx.OutputRangeAt(i).second); + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); + egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); + grad_node->SetGradInMeta(out_tensors, i); + egr::EagerUtils::CheckAndRetainGrad(out_tensors); + } + + // Prepare Grad inputs with fwd outputs + for (auto it = slot_map[1][2].begin(); it != slot_map[1][2].end(); it++) { + VLOG(7) << "Prepare fwd_outs: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_outs[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first, + ctx.OutputRangeAt(it->first).second)); + } + + // Prepare Grad inputs with fwd inputs + for (auto it = slot_map[1][3].begin(); it != slot_map[1][3].end(); it++) { + VLOG(7) << "Prepare fwd_ins: " << it->first + << " to grad_inputs: " << it->second; + 
grad_node->fwd_ins[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.InputsBetween(ctx.InputRangeAt(it->first).first, + ctx.InputRangeAt(it->first).second)); + } + + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type_)[2]); + std::vector attrs(attrs_names.size()); + // Prepare attrs for Grad node + for (auto it = slot_map[1][4].begin(); it != slot_map[1][4].end(); it++) { + VLOG(7) << "Prepare fwd attrs: " << it->first + << " to grad_attrs: " << it->second; + attrs[it->second] = attrs_[it->first]; + } + grad_node->SetAttrs(attrs); + } + + return outs; +} + +paddle::small_vector, + kSlotSmallVectorSize> +RunCustomOpDoubleGradNode::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, + bool create_graph, bool is_new_grad) { // NOLINT + paddle::CustomOpKernelContext ctx; + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + const auto& vec_map = meta_info_map.at(op_type_); + auto grad_inputs_name = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[2]); + auto grad_outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[2]); + auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); + + paddle::small_vector, + kSlotSmallVectorSize> + tmp_ins(grad_inputs_name.size()); + VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() + << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); + + auto hooked_grads = ApplyGradientHooks(grads); + + for (size_t i = 0; i < hooked_grads.size(); i++) { + if (map[1][1].find(i) != map[1][1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][1][i]; + tmp_ins[map[1][1][i]] = hooked_grads[i]; + } + } + + for (auto it : fwd_outs) { + VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second)); + } + + for (auto it : fwd_ins) { + VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second)); + } + + VLOG(6) << "Prepare Grad inputs"; + for (const auto& in : tmp_ins) { + ctx.EmplaceBackInputs(in); + } + VLOG(6) << "Prepare Grad attrs"; + ctx.EmplaceBackAttrs(attrs_); + paddle::small_vector, + kSlotSmallVectorSize> + outs(OutputMeta().size()); + paddle::small_vector, + kSlotSmallVectorSize> + tmp_outs(grad_outputs_names.size()); + VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); + + for (const auto& name : grad_outputs_names) { + VLOG(6) << "Prepare Grad outputs name is: " << name; + } + + for (size_t i = 0; i < OutputMeta().size(); i++) { + if (map[1][0].find(i) != map[1][0].end()) { + VLOG(7) << "Insert grad outputs: " << i + << " with size: " << OutputMeta()[i].size() + << " to tmp_outputs: " << map[1][0][i]; + for (size_t j = 0; j < OutputMeta()[i].size(); j++) { + outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + } + tmp_outs[map[1][0][i]] = outs[i]; + } + } + for (size_t i = 0; i < tmp_outs.size(); i++) { + VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); + ctx.EmplaceBackOutputs(tmp_outs[i]); + } + VLOG(7) << "Run Kernel of Grad Custom Op: " << name(); + + (*paddle::framework::OpMetaInfoHelper::GetKernelFn( + kernel_map.at(op_type_)[2]))(&ctx); + return outs; } } // namespace egr diff 
--git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index 4801088e51ba5..feea23730676e 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -67,7 +67,11 @@ class RunCustomOpNode : public GradNodeBase { return res; } - void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + void ClearTensorWrappers() override { + fwd_outs.clear(); + fwd_ins.clear(); + grads2grad_in_map.clear(); + } void SetAttrs(const std::vector& attr) { attrs_ = attr; } @@ -87,4 +91,75 @@ class RunCustomOpNode : public GradNodeBase { std::string op_type_{""}; }; +class RunCustomOpDoubleGradNode : public GradNodeBase { + public: + // Constructor: configure fwd input tensors to grad node + explicit RunCustomOpDoubleGradNode(size_t bwd_in_slot_num, + size_t bwd_out_slot_num, + const std::string& op_type) + : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) { + VLOG(6) << "Construct RunCustomOpDoubleGradNode for op: " << op_type; + } + + ~RunCustomOpDoubleGradNode() override { + VLOG(6) << "Destruct RunCustomOpDoubleGradNode for op: " << op_type_; + } + + // Functor: perform backward computations + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()( // NOLINT + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) // NOLINT + override; + + std::string name() { + return paddle::string::Sprintf("RunCustomOpDoubleGradNode: %s_grad_grad", + op_type_); + } + + static std::vector ConstructTensorWrapper( + const std::vector& fwd_var) { + std::vector res; + for (auto const& var : fwd_var) { + res.emplace_back(var); + } + return res; + } + + static std::vector Recover( + std::vector* fwd_var) { + std::vector res; + for (size_t i = 0; i < fwd_var->size(); i++) { + res.emplace_back(fwd_var->at(i).recover()); + } + return res; + } + + void ClearTensorWrappers() override { + fwd_outs.clear(); + fwd_ins.clear(); + grads2grad_in_map.clear(); + } + + void SetAttrs(const std::vector& attr) { attrs_ = attr; } + + std::shared_ptr Copy() const override { + auto copied_node = std::shared_ptr( + new RunCustomOpDoubleGradNode(*this)); + return copied_node; + } + + public: + std::unordered_map> fwd_outs; + std::unordered_map> fwd_ins; + std::unordered_map grads2grad_in_map; + + private: + std::vector attrs_; + std::string op_type_{""}; +}; + } // namespace egr diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index b11acae566d74..dd9881fcd5f0f 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -21,24 +21,176 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/core/compat/convert_utils.h" + +namespace egr { + +/** + * VariableCompatTensor class is used by Eager mode for now. It's painful to + * do this in Eager Mode, the better choice is to design the special Tensor + * directly in phi and use it in paddle::experimental::Tensor. + * However, we have some special operators, and they use special input variable + * type, such as vector, unordered_map, these type cannot + * cover by DenseTensor or SparseTensor. So, we have to provide a compatible + * Tensor type like variable to support these special input type. We should + * remove this as soon as we finish the ResourceTensor in phi. + * + * Note: Keep this class as clean as possible. 
+ * This class should only support method declared in framework::Variable and + * necessary overridden methods. + * + * Note: This class is only used to support types that cannot be supported by + * the phi Tensor system temporarily. You CANNOT use this class to handle types + * such as DenseTensor, SelectedRows, etc. + **/ +class VariableCompatTensor + : public phi::TensorBase, + public phi::TypeInfoTraits { + public: + template + const T& Get() const { + static_assert( + paddle::framework::IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PADDLE_ENFORCE_NOT_NULL(holder_, paddle::platform::errors::NotFound( + "Variable is not initialized.")); + PADDLE_ENFORCE_EQ( + holder_->Type(), paddle::framework::VarTypeTrait::kId, + paddle::platform::errors::InvalidArgument( + "The Variable type must be %s, but the type it holds is %s.", + paddle::framework::ToTypeName( + paddle::framework::VarTypeTrait::kId), + paddle::framework::ToTypeName(holder_->Type()))); + return *static_cast(holder_->Ptr()); + } + + bool IsInitialized() const { return holder_ != nullptr; } + + template + T* GetMutable() { + if (!holder_) { + holder_.reset(new PlaceholderImpl()); + } else { + PADDLE_ENFORCE_EQ( + holder_->Type(), paddle::framework::VarTypeTrait::kId, + paddle::platform::errors::InvalidArgument( + "The Variable type must be %s, but the type it holds is %s.", + paddle::framework::ToTypeName( + paddle::framework::VarTypeTrait::kId), + paddle::framework::ToTypeName(holder_->Type()))); + } + return static_cast(holder_->Ptr()); + } + + template + bool IsType() const { + return holder_ && + holder_->Type() == paddle::framework::VarTypeTrait::kId; + } + + void Clear() { holder_.reset(); } + + int Type() const { + PADDLE_ENFORCE_NOT_NULL(holder_, paddle::platform::errors::NotFound( + "Variable is not initialized.")); + return holder_->Type(); + } + + // necessary overridden methods + + static const char* name() { return "VariableCompatTensor"; } + + ~VariableCompatTensor() override = default; + + int64_t numel() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `numel` method.")); + } + + const phi::DDim& dims() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `dims` method.")); + } + + phi::DataType dtype() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `dtype` method.")); + } + + phi::DataLayout layout() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `layout` method.")); + } + + const phi::Place& place() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `place` method.")); + } + + bool valid() const override { return IsInitialized(); } + + bool initialized() const override { return IsInitialized(); } + + void* AllocateFrom(phi::Allocator* allocator, phi::DataType dtype, + size_t requested_size = 0) override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `AllocateFrom` method.")); + } + + private: + struct Placeholder { + virtual ~Placeholder() PADDLE_MAY_THROW {} + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + + void* ptr_; + int type_; + }; 
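// Get<T>()/GetMutable<T>() rely on this type-erasure scheme: PlaceholderImpl<T>
// (declared just below) owns a T and records VarTypeTrait<T>::kId, so the
// accessors can verify the stored id before casting holder_->Ptr() back to T.
// A minimal usage sketch, assuming Vocab is a var type registered in
// var_type_traits.h (as the VOCAB checks elsewhere in this patch suggest):
//
//   egr::VariableCompatTensor vct;
//   auto* vocab = vct.GetMutable<paddle::framework::Vocab>();  // creates the holder
//   CHECK(vct.IsType<paddle::framework::Vocab>());
//   const auto& ro = vct.Get<paddle::framework::Vocab>();      // type-checked read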
+ + // Placeholder hides type T, so it doesn't appear as a template + // parameter of Variable. + template + struct PlaceholderImpl : public Placeholder { + static_assert( + paddle::framework::IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() { + this->Init(&obj_, paddle::framework::VarTypeTrait::kId); + } + + private: + T obj_; + }; + + // pointers to a PlaceholderImpl object indeed. + std::shared_ptr holder_; +}; + +inline bool IsVariableCompatTensor(const paddle::experimental::Tensor& tensor) { + return VariableCompatTensor::classof(tensor.impl().get()); +} + /** * This class is used by Eager mode for now. It's painful to do this in Eager - * Mode, the better - * choice is to use paddle::experimental::Tensor directly. However, we have a - * punch of nested kernel code, and - * they use paddle::framework::Variable in inner logic code. So, we have to - * provide variable in - * paddle::framework::ExecutionContext to support it. We should remove this as - * soon as we finish our latest - * Phi Lib, and use paddle::experimental::Tensor instead. + * Mode, the better choice is to use paddle::experimental::Tensor directly. + * However, we have a punch of nested kernel code, and they use + * paddle::framework::Variable in inner logic code. So, we have to provide + * variable in paddle::framework::ExecutionContext to support it. We should + * remove this as soon as we finish our latest Phi Lib, and use + * paddle::experimental::Tensor instead. * * Note: Keep this class as clean as possible. * This class should only support method declared in * paddle::experimental::Tensor with access method of * paddle::framework::Variable no more members are acceptable. * **/ - -namespace egr { class EagerVariable final { public: /* Default constructor and name constructor should only be used for contruct @@ -54,6 +206,14 @@ class EagerVariable final { ConstructVariableFromTensor(tensor); } else if (tensor.is_selected_rows()) { ConstructVariableFromTensor(tensor); + } else if (IsVariableCompatTensor(tensor) && + static_cast(tensor.impl().get()) + ->IsType()) { + ConstructVariableFromCompatTensor(tensor); + } else if (IsVariableCompatTensor(tensor) && + static_cast(tensor.impl().get()) + ->IsType()) { + ConstructVariableFromCompatTensor(tensor); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unrecognized egr::EagerVariable type, only " @@ -119,6 +279,22 @@ class EagerVariable final { *framework_tensor = *tensor_dense; } + template + void ConstructVariableFromCompatTensor( + const paddle::experimental::Tensor& tensor) { + auto* framework_holder = var_.GetMutable(); + // Contruct framework::Tensor from egr::EagerVariable + auto* compat_tensor = + static_cast(tensor.impl().get()); + PADDLE_ENFORCE_NOT_NULL(compat_tensor, + paddle::platform::errors::Fatal( + "Tensor %s holds empty impl, this should not " + "happend since we should " + "treat all kinds of tensor as what they are.", + tensor.name())); + *framework_holder = compat_tensor->Get(); + } + private: std::string name_{""}; paddle::framework::Variable var_; diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc new file mode 100644 index 0000000000000..d676955016684 --- /dev/null +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/nan_inf_utils.h" + +#include "paddle/fluid/framework/details/nan_inf_utils_detail.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace egr { + +void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { + if (tensor.initialized()) { + auto& tensor_name = tensor.name(); + const phi::DenseTensor* dense_tensor{nullptr}; + if (tensor.is_dense_tensor()) { + dense_tensor = static_cast(tensor.impl().get()); + } else if (tensor.is_selected_rows()) { + dense_tensor = &( + static_cast(tensor.impl().get())->value()); + } else { + VLOG(10) << "Only DenseTensor or SelectedRows need to check, " + << tensor_name << " is no need."; + return; + } + + auto& place = dense_tensor->place(); + if (paddle::platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::framework::details::tensor_check< + paddle::platform::CUDADeviceContext>(api_name, tensor_name, + *dense_tensor, place); +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.", + tensor_name)); +#endif + return; + } + paddle::framework::details::tensor_check< + paddle::platform::CPUDeviceContext>(api_name, tensor_name, + *dense_tensor, place); + } +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTwoTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfThreeTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfFourTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<3>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfFiveTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<3>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<4>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfSixTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<3>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<4>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<5>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const std::vector& tensors) { + for (auto& tensor : tensors) { + 
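// Each element re-enters the single-tensor overload above, which only inspects
// DenseTensor and SelectedRows (via value()); other tensor impls are skipped
// with a VLOG(10) note instead of an error.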
CheckTensorHasNanOrInf(api_name, tensor); + } +} + +void CheckTensorHasNanOrInf( + const std::string& api_name, + const paddle::small_vector, + egr::kSlotSmallVectorSize>& tensors) { + for (auto& tensor_vector : tensors) { + CheckTensorHasNanOrInf(api_name, tensor_vector); + } +} + +} // namespace egr diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h new file mode 100644 index 0000000000000..5309eeb2959dc --- /dev/null +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/eager/type_defs.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/utils/small_vector.h" + +namespace egr { + +using paddle::experimental::Tensor; +using TupleOfTwoTensors = std::tuple; +using TupleOfThreeTensors = std::tuple; +using TupleOfFourTensors = std::tuple; +using TupleOfFiveTensors = std::tuple; +using TupleOfSixTensors = + std::tuple; + +void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTwoTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfThreeTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfFourTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfFiveTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfSixTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const std::vector& tensors); + +void CheckTensorHasNanOrInf( + const std::string& api_name, + const paddle::small_vector, + egr::kSlotSmallVectorSize>& tensors); + +} // namespace egr diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 28b116b41ea91..8893e0ed7ee0a 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -34,7 +34,6 @@ class TensorWrapper { public: TensorWrapper() = default; explicit TensorWrapper(const paddle::experimental::Tensor& tensor, - bool full_reserved = false, bool no_need_buffer = false) { // set inplace_version_snapshot_ according to tensor's current inplace // version. @@ -46,32 +45,12 @@ class TensorWrapper { } /** - * Normally, we should fully reserved all non-output or non-leaf fwd tensor - * here. 
And for fwd output tensor, we should not reserve its autogradmeta, - * to avoid recursive depends on GradNodeBase + * Normally, we should only save data and part of autograd_meta of fwd + * tensor, and should not reserve its original grad_node, + * to avoid recursive and additional depends on GradNodeBase * **/ - full_reserved_ = full_reserved; + auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); no_need_buffer_ = no_need_buffer; - if (full_reserved_) { - VLOG(6) << "Fully reserved tensor: " << tensor.name(); - intermidiate_tensor_ = tensor; - if (no_need_buffer_) { - if (phi::DenseTensor::classof(tensor.impl().get())) { - // Only Copy Meta - phi::DenseTensor* dense_tensor = - static_cast(tensor.impl().get()); - auto tw_dense_tensor = - std::make_shared(*dense_tensor); - tw_dense_tensor->clear(); - intermidiate_tensor_.set_impl(tw_dense_tensor); - } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Unrecognized tensor type for no_need_buffer feature")); - } - } - return; - } - // shallow copy tensor_impl here if (no_need_buffer) { if (phi::DenseTensor::classof(tensor.impl().get())) { @@ -89,10 +68,11 @@ class TensorWrapper { intermidiate_tensor_.set_impl(tensor.impl()); } - // TODO(jiabin): This may has server performance issue - intermidiate_tensor_.set_name(tensor.name() + "@Saved"); + if (VLOG_IS_ON(7)) { + // TODO(jiabin): This may has server performance issue + intermidiate_tensor_.set_name(tensor.name() + "@Saved"); + } - auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); if (tensor_autograd_meta) { auto autograd_meta = std::make_shared(*tensor_autograd_meta); @@ -112,27 +92,28 @@ class TensorWrapper { check_inplace_version(); - // if it's full_reserved just return the full copy of tensor - if (full_reserved_) { - return intermidiate_tensor_; + paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; + + std::shared_ptr new_grad_node = weak_grad_node_.lock(); + if (new_grad_node) { + VLOG(3) << "Recovered TensorWrapper with GradNode " + << new_grad_node->name() << " addr: " << new_grad_node.get(); } else { - paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; + VLOG(3) << "Recovered TensorWrapper with Empty GradNode"; + } + auto* intermediate_autograd_meta = + EagerUtils::nullable_autograd_meta(intermidiate_tensor_); - std::shared_ptr new_grad_node = weak_grad_node_.lock(); - auto* intermediate_autograd_meta = - EagerUtils::unsafe_autograd_meta(intermidiate_tensor_); + if (intermediate_autograd_meta) { auto p_ab_autograd_meta = std::make_shared(*intermediate_autograd_meta); if (new_grad_node) { - VLOG(3) << "Recovered TensorWrapper with GradNode " - << new_grad_node->name() << " addr: " << new_grad_node.get(); p_ab_autograd_meta->SetGradNode(new_grad_node); - } else { - VLOG(3) << "Recovered TensorWrapper with Empth GradNode"; } recovered_tensor.set_autograd_meta(p_ab_autograd_meta); - return recovered_tensor; } + + return recovered_tensor; } void clear() { intermidiate_tensor_.reset(); } @@ -173,7 +154,6 @@ class TensorWrapper { } private: - bool full_reserved_ = false; bool no_need_buffer_ = false; paddle::experimental::Tensor intermidiate_tensor_; std::weak_ptr weak_grad_node_; diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index a9a50a3621767..edbb441f27a08 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ 
b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -233,3 +233,88 @@ TEST(EagerVariable, DataLayout) { layout = paddle::imperative::GetDataLayout(eager_var); CHECK_EQ(layout, paddle::experimental::DataLayout::NCHW); } + +TEST(VariableCompatTensor, MemberFunction) { + egr::VariableCompatTensor var_tensor; + // test GetMutable and Get + var_tensor.GetMutable(); + auto& vocab = var_tensor.Get(); + EXPECT_EQ(vocab.size(), 0UL); + bool caught_exception = false; + try { + var_tensor.GetMutable(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("The Variable type must be") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + // test Type and IsType + EXPECT_TRUE(var_tensor.IsType()); + EXPECT_EQ(var_tensor.Type(), + static_cast(paddle::framework::proto::VarType::VOCAB)); + // test valid and initialized + EXPECT_TRUE(var_tensor.IsInitialized()); + EXPECT_TRUE(var_tensor.valid()); + EXPECT_TRUE(var_tensor.initialized()); + // test name + EXPECT_EQ(var_tensor.name(), "VariableCompatTensor"); + // test other throw error methods + caught_exception = false; + try { + var_tensor.numel(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("numel") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.dims(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("dims") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.dtype(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("dtype") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.layout(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("layout") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.place(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("place") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.AllocateFrom(nullptr, phi::DataType::UNDEFINED); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("AllocateFrom") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + // test Clear + var_tensor.Clear(); + EXPECT_FALSE(var_tensor.IsInitialized()); +} diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc index 5f563edee39f1..28c3472f90d03 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc @@ -40,9 +40,11 @@ TEST(TensorWrapper, Basic) { auto auto_grad0 = std::make_shared(edge0); et1.set_autograd_meta(auto_grad0); et1.set_name("et1"); - auto tw0 = egr::TensorWrapper(et1, true); + auto tw0 = egr::TensorWrapper(et1); auto recover_et1 = tw0.recover(); - CHECK_EQ(recover_et1.name(), 
std::string("et1")); + if (VLOG_IS_ON(7)) { + CHECK_EQ(recover_et1.name(), std::string("et1@saved")); + } CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).first, egr::EagerUtils::OutRankInfo(et1).first); CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).second, @@ -68,13 +70,15 @@ TEST(TensorWrapper, Basic) { et2.set_autograd_meta(auto_grad1); auto tw1 = egr::TensorWrapper(et2, false); auto recover_et2 = tw1.recover(); - CHECK_EQ(recover_et2.name(), std::string("et2@Saved")); + if (VLOG_IS_ON(7)) { + CHECK_EQ(recover_et2.name(), std::string("et2@Saved")); + } CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).first, egr::EagerUtils::OutRankInfo(et2).first); CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).second, egr::EagerUtils::OutRankInfo(et2).second); // Test Raw recover paddle::experimental::Tensor et3; - auto tw2 = egr::TensorWrapper(et3, true); + auto tw2 = egr::TensorWrapper(et3); CHECK(tw2.recover().initialized() == false); } diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index 5a09ffd6a1e5f..719ef6673c07d 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -1,6 +1,7 @@ cc_test(test_egr_task_tensor_utils SRCS tensor_utils_test.cc DEPS ${eager_deps}) cc_test(test_egr_task_eager_utils SRCS eager_utils_test.cc DEPS ${eager_deps}) cc_test(test_egr_task_forward_autograd SRCS forward_autograd_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test(test_egr_task_nan_inf_utils SRCS nan_inf_utils_test.cc DEPS eager_nan_inf_utils) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) diff --git a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc new file mode 100644 index 0000000000000..be0563fbeedb4 --- /dev/null +++ b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/api/include/strings_api.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(strings_empty, CPU, ALL_LAYOUT); + +namespace egr { + +#define CHECK_NAN_INF(tensors) \ + { \ + bool caught_exception = false; \ + try { \ + CheckTensorHasNanOrInf("nan_inf_test", tensors); \ + } catch (paddle::platform::EnforceNotMet & error) { \ + caught_exception = true; \ + std::string ex_msg = error.what(); \ + EXPECT_TRUE(ex_msg.find("There are `nan` or `inf` in tensor") != \ + std::string::npos); \ + } \ + EXPECT_TRUE(caught_exception); \ + } + +#define CHECK_NO_NAN_INF(tensors) \ + { \ + bool caught_exception = false; \ + try { \ + CheckTensorHasNanOrInf("nan_inf_test", tensors); \ + } catch (paddle::platform::EnforceNotMet & error) { \ + caught_exception = true; \ + std::string ex_msg = error.what(); \ + EXPECT_TRUE(ex_msg.find("There are `nan` or `inf` in tensor") != \ + std::string::npos); \ + } \ + EXPECT_FALSE(caught_exception); \ + } + +TEST(NanInfUtils, Functions) { + // test all methods + auto tensor = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + CHECK_NAN_INF(tensor); + auto tensor1 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto two_tensors = std::make_tuple(tensor, tensor1); + CHECK_NAN_INF(two_tensors); + auto tensor2 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto three_tensors = std::make_tuple(tensor, tensor1, tensor2); + CHECK_NAN_INF(three_tensors); + auto tensor3 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto four_tensors = std::make_tuple(tensor, tensor1, tensor2, tensor3); + CHECK_NAN_INF(four_tensors); + auto tensor4 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto five_tensors = + std::make_tuple(tensor, tensor1, tensor2, tensor3, tensor4); + CHECK_NAN_INF(five_tensors); + auto tensor5 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto six_tensors = + std::make_tuple(tensor, tensor1, tensor2, tensor3, tensor4, tensor5); + CHECK_NAN_INF(six_tensors); + std::vector tensor_vec; + tensor_vec.emplace_back(tensor); + tensor_vec.emplace_back(tensor1); + CHECK_NAN_INF(tensor_vec); + paddle::small_vector, + egr::kSlotSmallVectorSize> + small_vec; + small_vec.emplace_back(tensor_vec); + CHECK_NAN_INF(small_vec); + // test selected_rows + paddle::experimental::Tensor tensor_sr; + auto sr = std::make_shared(); + *sr->mutable_value() = + *(static_cast(tensor.impl().get())); + tensor_sr.set_impl(sr); + CHECK_NAN_INF(tensor_sr); + // test other tensor + auto tensor_str = paddle::experimental::strings::empty({3, 4}); + CHECK_NO_NAN_INF(tensor_str); +} + +} // namespace egr diff --git a/paddle/fluid/eager/type_defs.h b/paddle/fluid/eager/type_defs.h new file mode 100644 index 0000000000000..c57e718f1df3b --- /dev/null +++ b/paddle/fluid/eager/type_defs.h @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace egr { + +constexpr size_t kSlotSmallVectorSize = 15U; + +} // namespace egr diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 033af5c496c98..65294a8eb7abc 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -157,7 +157,7 @@ void EagerUtils::SetHistory(std::vector* autograd_metas, if (autograd_meta->GradNode()) { VLOG(7) << "Should not set grad node twice, original node is:" << autograd_meta->GradNode()->name() - << "current is: " << grad_node->name(); + << " current is: " << grad_node->name(); } autograd_meta->SetGradNode(grad_node); } diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 1947c669e9bb0..3f10cd7765bc1 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -152,7 +152,7 @@ class Dataset { virtual void DestroyPreLoadReaders() = 0; // set preload thread num virtual void SetPreLoadThreadNum(int thread_num) = 0; - // seperate train thread and dataset thread + // separate train thread and dataset thread virtual void DynamicAdjustChannelNum(int channel_num, bool discard_remaining_ins = false) = 0; virtual void DynamicAdjustReadersNum(int thread_num) = 0; diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 14b5662b24aeb..c4ea6a3c6bc66 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -18,6 +18,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/transform.h" +#if defined(PADDLE_WITH_XPU) +#include "paddle/fluid/platform/device/device_wrapper.h" +#endif + namespace paddle { namespace framework { @@ -28,6 +32,49 @@ struct CastDataTypeFunctor { } }; +#if defined(PADDLE_WITH_XPU) + +template +static void XPUCastData(const framework::Tensor& in, framework::Tensor* out, + const platform::XPUDeviceContext* dev_ctx) { + using XPUInTDType = typename XPUTypeTrait::Type; + using XPUOutTDType = typename XPUTypeTrait::Type; + int r = xpu::cast_v2( + dev_ctx->x_context(), + reinterpret_cast(in.data()), + reinterpret_cast(out->mutable_data(in.place())), + in.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + dev_ctx->Wait(); +} + +template +static void XPUTransDataType( + const framework::Tensor& in, framework::Tensor* out, + const paddle::framework::proto::VarType::Type& dst_type, + const platform::DeviceContext* ctx) { + auto* context = static_cast(ctx); + +#define XPUCastCallback(cpp_type, proto_type) \ + do { \ + if (dst_type == proto_type) { \ + XPUCastData(in, out, context); \ + } \ + } while (0) + + if (dst_type == proto::VarType::FP32 && dst_type == proto::VarType::FP16 && + dst_type == proto::VarType::BOOL && dst_type == proto::VarType::INT16 && + dst_type == proto::VarType::INT32 && dst_type == proto::VarType::INT64) { + _ForEachDataType_(XPUCastCallback); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(dst_type))); + } +} + +#endif + template struct CastDataType { CastDataType(const framework::Tensor& in, framework::Tensor* out, @@ -88,6 +135,34 @@ void TransDataType(const Tensor& in, auto dst_type = type; auto ctx = pool.Get(in.place()); +#if defined(PADDLE_WITH_XPU) + switch (src_type) { + case proto::VarType::FP16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::BOOL: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT64: + XPUTransDataType(in, out, dst_type, ctx); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(src_type))); + } + +#else + switch (src_type) { case proto::VarType::FP16: framework::VisitDataType(dst_type, @@ -123,6 +198,7 @@ void TransDataType(const Tensor& in, "Data type (%s) is not supported when casting data type.", DataTypeToString(src_type))); } +#endif } void TransComplexToReal(const proto::VarType::Type& dst_type, @@ -131,7 +207,6 @@ void TransComplexToReal(const proto::VarType::Type& dst_type, auto& pool = platform::DeviceContextPool::Instance(); auto* ctx = pool.Get(in.place()); out->Resize(in.dims()); - // complex -> real switch (src_type) { case proto::VarType::COMPLEX64: diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index da72215653e75..e6790de92d054 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/nan_inf_utils_detail.h" #include 
"paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/scope.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index f6a97160d8271..7cf11f7829da9 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -19,7 +19,9 @@ #include #include #include + #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 08bac5d63323b..5668ab31f36b6 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -16,7 +16,7 @@ #include -#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" namespace phi { diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index d198eb1459288..7e63c5ffb9a44 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -75,7 +75,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { in_var_handles.size(), places_.size(), platform::errors::PreconditionNotMet( "The number of input variables should be equal to the number of " - "places, but got the number of input variables is %zu and the the " + "places, but got the number of input variables is %zu and the " "number of places is %zu.", in_var_handles.size(), places_.size())); PADDLE_ENFORCE_EQ( @@ -83,7 +83,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { platform::errors::PreconditionNotMet( "The number of input variables should be equal to the number of " "output variables, but got the number of input variables is %zu and " - "the the number of output variables is %zu.", + "the number of output variables is %zu.", in_var_handles.size(), out_var_handles.size())); std::vector ins; diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 51456457d0606..d62fc1c084962 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -13,11 +13,10 @@ IF(WITH_GPU) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table hashtable_kernel) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table_inl.cu DEPS heter_comm table hashtable_kernel) nv_library(graph_sampler SRCS graph_sampler_inl.h DEPS graph_gpu_ps) - - nv_test(test_cpu_query SRCS test_cpu_query.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) - nv_library(graph_gpu_wrapper SRCS graph_gpu_wrapper.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) + nv_library(graph_gpu_wrapper SRCS graph_gpu_wrapper.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS} graph_gpu_ps) + nv_test(test_cpu_query SRCS test_cpu_query.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS} graph_gpu_ps graph_gpu_wrapper) 
#ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) #target_link_libraries(test_sample_rate heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index e7601edb0ca07..19c355c671a38 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -24,7 +24,7 @@ namespace paddle { namespace framework { struct GpuPsGraphNode { int64_t node_id; - unsigned int neighbor_size, neighbor_offset; + int64_t neighbor_size, neighbor_offset; // this node's neighbor is stored on [neighbor_offset,neighbor_offset + // neighbor_size) of int64_t *neighbor_list; }; @@ -32,17 +32,17 @@ struct GpuPsGraphNode { struct GpuPsCommGraph { int64_t *neighbor_list; GpuPsGraphNode *node_list; - unsigned int neighbor_size, node_size; + int64_t neighbor_size, node_size; // the size of neighbor array and graph_node_list array GpuPsCommGraph() : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - unsigned int neighbor_size_, unsigned int node_size_) + int64_t neighbor_size_, int64_t node_size_) : neighbor_list(neighbor_list_), node_list(node_list_), neighbor_size(neighbor_size_), node_size(node_size_) {} - void init_on_cpu(unsigned int neighbor_size, unsigned int node_size) { + void init_on_cpu(int64_t neighbor_size, int64_t node_size) { this->neighbor_size = neighbor_size; this->node_size = node_size; this->neighbor_list = new int64_t[neighbor_size]; @@ -208,12 +208,43 @@ struct NeighborSampleResult { delete[] ac_size; VLOG(0) << " ------------------"; } - NeighborSampleResult(){}; - ~NeighborSampleResult() { - // if (val != NULL) cudaFree(val); - // if (actual_sample_size != NULL) cudaFree(actual_sample_size); - // if (offset != NULL) cudaFree(offset); + std::vector get_sampled_graph(NeighborSampleQuery q) { + std::vector graph; + int64_t *sample_keys = new int64_t[q.len]; + std::string key_str; + cudaMemcpy(sample_keys, q.key, q.len * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int64_t *res = new int64_t[sample_size * key_size]; + cudaMemcpy(res, val, sample_size * key_size * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int *ac_size = new int[key_size]; + cudaMemcpy(ac_size, actual_sample_size, key_size * sizeof(int), + cudaMemcpyDeviceToHost); // 3, 1, 3 + int total_sample_size = 0; + for (int i = 0; i < key_size; i++) { + total_sample_size += ac_size[i]; + } + int64_t *res2 = new int64_t[total_sample_size]; // r + cudaMemcpy(res2, actual_val, total_sample_size * sizeof(int64_t), + cudaMemcpyDeviceToHost); // r + + int start = 0; + for (int i = 0; i < key_size; i++) { + graph.push_back(sample_keys[i]); + graph.push_back(ac_size[i]); + for (int j = 0; j < ac_size[i]; j++) { + graph.push_back(res2[start + j]); + } + start += ac_size[i]; // r + } + delete[] res; + delete[] res2; // r + delete[] ac_size; + delete[] sample_keys; + return graph; } + NeighborSampleResult(){}; + ~NeighborSampleResult() {} }; struct NodeQueryResult { diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 8a0088114e2ec..9e7ee80edcd0c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ 
b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -23,15 +23,17 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource, int topo_aware) - : HeterComm(1, resource) { + : HeterComm(1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); gpu_num = resource_->total_device(); + memset(global_device_map, -1, sizeof(global_device_map)); for (int i = 0; i < gpu_num; i++) { gpu_graph_list.push_back(GpuPsCommGraph()); + global_device_map[resource_->dev_id(i)] = i; sample_status.push_back(NULL); tables_.push_back(NULL); } @@ -98,27 +100,20 @@ class GpuPsGraphTable : public HeterComm { NeighborSampleResult graph_neighbor_sample_v2(int gpu_id, int64_t *key, int sample_size, int len, bool cpu_query_switch); + void init_sample_status(); + void free_sample_status(); NodeQueryResult query_node_list(int gpu_id, int start, int query_size); void clear_graph_info(); + void display_sample_res(void *key, void *val, int len, int sample_len); void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, int sample_size, int *h_left, int *h_right, int64_t *src_sample_res, int *actual_sample_size); - // void move_neighbor_sample_result_to_source_gpu( - // int gpu_id, int gpu_num, int *h_left, int *h_right, - // int64_t *src_sample_res, thrust::host_vector &total_sample_size); - // void move_neighbor_sample_size_to_source_gpu(int gpu_id, int gpu_num, - // int *h_left, int *h_right, - // int *actual_sample_size, - // int *total_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); - // int load(const std::string &path, const std::string ¶m); - // virtual int32_t end_graph_sampling() { - // return cpu_graph_table->end_graph_sampling(); - // } int gpu_num; std::vector gpu_graph_list; + int global_device_map[32]; std::vector sample_status; const int parallel_sample_size = 1; const int dim_y = 256; @@ -130,5 +125,5 @@ class GpuPsGraphTable : public HeterComm { }; } }; -#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h" +//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu similarity index 77% rename from paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h rename to paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 605019cb607fc..631ca962fae9c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -18,7 +18,7 @@ #include #pragma once #ifdef PADDLE_WITH_HETERPS -//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { /* @@ -32,23 +32,48 @@ sample_result is to save the neighbor sampling result, its size is len * sample_size; */ -__global__ void get_cpu_id_index(int64_t* key, unsigned int* val, +__global__ void get_cpu_id_index(int64_t* key, int* actual_sample_size, int64_t* cpu_key, int* sum, int* index, int len) { CUDA_KERNEL_LOOP(i, len) { - if (val[i] == ((unsigned int)-1)) { + if (actual_sample_size[i] == -1) { int old = atomicAdd(sum, 1); cpu_key[old] = key[i]; index[old] = i; + // printf("old %d i-%d key:%lld\n",old,i,key[i]); } } } +__global__ void 
get_actual_gpu_ac(int* gpu_ac, int number_on_cpu) { + CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(int64_t); } +} + +template +__global__ void copy_buffer_ac_to_final_place( + int64_t* gpu_buffer, int* gpu_ac, int64_t* val, int* actual_sample_size, + int* index, int* cumsum_gpu_ac, int number_on_cpu, int sample_size) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int i = blockIdx.x * TILE_SIZE + threadIdx.y; + const int last_idx = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, number_on_cpu); + while (i < last_idx) { + actual_sample_size[index[i]] = gpu_ac[i]; + for (int j = threadIdx.x; j < gpu_ac[i]; j += WARP_SIZE) { + val[index[i] * sample_size + j] = gpu_buffer[cumsum_gpu_ac[i] + j]; + } + i += BLOCK_WARPS; + } +} + template __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, - unsigned int* node_index, + int64_t* node_index, int* actual_size, int64_t* res, - int sample_len, int n) { + int sample_len, int n, + int default_value) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -58,13 +83,13 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, curand_init(blockIdx.x, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); while (i < last_idx) { - if (node_index[i] == (unsigned int)(-1)) { - actual_size[i] = 0; + if (node_index[i] == -1) { + actual_size[i] = default_value; i += BLOCK_WARPS; continue; } - int neighbor_len = graph.node_list[node_index[i]].neighbor_size; - int data_offset = graph.node_list[node_index[i]].neighbor_offset; + int neighbor_len = (int)graph.node_list[node_index[i]].neighbor_size; + int64_t data_offset = graph.node_list[node_index[i]].neighbor_offset; int offset = i * sample_len; int64_t* data = graph.neighbor_list; if (neighbor_len <= sample_len) { @@ -86,7 +111,7 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, } __syncwarp(); for (int j = threadIdx.x; j < sample_len; j += WARP_SIZE) { - const int perm_idx = res[offset + j] + data_offset; + const int64_t perm_idx = res[offset + j] + data_offset; res[offset + j] = data[perm_idx]; } actual_size[i] = sample_len; @@ -96,23 +121,22 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, } __global__ void neighbor_sample_example(GpuPsCommGraph graph, - unsigned int* node_index, - int* actual_size, int64_t* res, - int sample_len, int* sample_status, - int n, int from) { + int64_t* node_index, int* actual_size, + int64_t* res, int sample_len, + int* sample_status, int n, int from) { int id = blockIdx.x * blockDim.y + threadIdx.y; if (id < n) { - if (node_index[id] == (unsigned int)(-1)) { + if (node_index[id] == -1) { actual_size[id] = 0; return; } curandState rng; curand_init(blockIdx.x, threadIdx.x, threadIdx.y, &rng); - int index = threadIdx.x; - int offset = id * sample_len; + int64_t index = threadIdx.x; + int64_t offset = id * sample_len; int64_t* data = graph.neighbor_list; - int data_offset = graph.node_list[node_index[id]].neighbor_offset; - int neighbor_len = graph.node_list[node_index[id]].neighbor_size; + int64_t data_offset = graph.node_list[node_index[id]].neighbor_offset; + int64_t neighbor_len = graph.node_list[node_index[id]].neighbor_size; int ac_len; if (sample_len > neighbor_len) ac_len = neighbor_len; @@ -220,6 +244,29 @@ int GpuPsGraphTable::init_cpu_table( that's what fill_dvals does. 
*/ +void GpuPsGraphTable::display_sample_res(void* key, void* val, int len, + int sample_len) { + char key_buffer[len * sizeof(int64_t)]; + char val_buffer[sample_len * sizeof(int64_t) * len + + (len + len % 2) * sizeof(int) + len * sizeof(int64_t)]; + cudaMemcpy(key_buffer, key, sizeof(int64_t) * len, cudaMemcpyDeviceToHost); + cudaMemcpy(val_buffer, val, + sample_len * sizeof(int64_t) * len + + (len + len % 2) * sizeof(int) + len * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int64_t* sample_val = (int64_t*)(val_buffer + (len + len % 2) * sizeof(int) + + len * sizeof(int64_t)); + for (int i = 0; i < len; i++) { + printf("key %lld\n", *(int64_t*)(key_buffer + i * sizeof(int64_t))); + printf("index %lld\n", *(int64_t*)(val_buffer + i * sizeof(int64_t))); + int ac_size = *(int*)(val_buffer + i * sizeof(int) + len * sizeof(int64_t)); + printf("sampled %d neighbors\n", ac_size); + for (int j = 0; j < ac_size; j++) { + printf("%lld ", sample_val[i * sample_len + j]); + } + printf("\n"); + } +} void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( int start_index, int gpu_num, int sample_size, int* h_left, int* h_right, int64_t* src_sample_res, int* actual_sample_size) { @@ -229,7 +276,7 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( continue; } shard_len[i] = h_right[i] - h_left[i] + 1; - int cur_step = path_[start_index][i].nodes_.size() - 1; + int cur_step = (int)path_[start_index][i].nodes_.size() - 1; for (int j = cur_step; j > 0; j--) { cudaMemcpyAsync(path_[start_index][i].nodes_[j - 1].val_storage, path_[start_index][i].nodes_[j].val_storage, @@ -240,12 +287,12 @@ auto& node = path_[start_index][i].nodes_.front(); cudaMemcpyAsync( reinterpret_cast(src_sample_res + h_left[i] * sample_size), - node.val_storage + sizeof(int64_t) * shard_len[i], - node.val_bytes_len - sizeof(int64_t) * shard_len[i], cudaMemcpyDefault, + node.val_storage + sizeof(int64_t) * shard_len[i] + + sizeof(int) * (shard_len[i] + shard_len[i] % 2), + sizeof(int64_t) * shard_len[i] * sample_size, cudaMemcpyDefault, node.out_stream); - // resource_->remote_stream(i, start_index)); cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), - node.val_storage + sizeof(int) * shard_len[i], + node.val_storage + sizeof(int64_t) * shard_len[i], sizeof(int) * shard_len[i], cudaMemcpyDefault, node.out_stream); } @@ -440,15 +487,15 @@ void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { // platform::CUDADeviceGuard guard(i); gpu_graph_list[i] = GpuPsCommGraph(); sample_status[i] = NULL; - tables_[i] = new Table(std::max((unsigned int)1, g.node_size) / load_factor_); + tables_[i] = new Table(std::max((int64_t)1, g.node_size) / load_factor_); if (g.node_size > 0) { std::vector keys; - std::vector offset; + std::vector offset; cudaMalloc((void**)&gpu_graph_list[i].node_list, g.node_size * sizeof(GpuPsGraphNode)); cudaMemcpy(gpu_graph_list[i].node_list, g.node_list, g.node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); - for (unsigned int j = 0; j < g.node_size; j++) { + for (int64_t j = 0; j < g.node_size; j++) { keys.push_back(g.node_list[j].node_id); offset.push_back(j); } @@ -460,12 +507,15 @@ void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { gpu_graph_list[i].node_size = 0; } if (g.neighbor_size) { - int* addr; - cudaMalloc((void**)&addr, g.neighbor_size * sizeof(int)); - cudaMemset(addr, 0, g.neighbor_size * sizeof(int)); - sample_status[i] = addr; -
cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, - g.neighbor_size * sizeof(int64_t)); + cudaError_t cudaStatus = + cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, + g.neighbor_size * sizeof(int64_t)); + PADDLE_ENFORCE_EQ(cudaStatus, cudaSuccess, + platform::errors::InvalidArgument( + "Failed to allocate memory for graph on gpu ")); + VLOG(0) << "successfully allocated " << g.neighbor_size * sizeof(int64_t) + << " bytes of memory for graph-edges on gpu " + << resource_->dev_id(i); cudaMemcpy(gpu_graph_list[i].neighbor_list, g.neighbor_list, g.neighbor_size * sizeof(int64_t), cudaMemcpyHostToDevice); gpu_graph_list[i].neighbor_size = g.neighbor_size; @@ -474,6 +524,27 @@ gpu_graph_list[i].neighbor_size = 0; } } + +void GpuPsGraphTable::init_sample_status() { + for (int i = 0; i < gpu_num; i++) { + if (gpu_graph_list[i].neighbor_size) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + int* addr; + cudaMalloc((void**)&addr, gpu_graph_list[i].neighbor_size * sizeof(int)); + cudaMemset(addr, 0, gpu_graph_list[i].neighbor_size * sizeof(int)); + sample_status[i] = addr; + } + } +} + +void GpuPsGraphTable::free_sample_status() { + for (int i = 0; i < gpu_num; i++) { + if (sample_status[i] != NULL) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + cudaFree(sample_status[i]); + } + } +} void GpuPsGraphTable::build_graph_from_cpu( std::vector& cpu_graph_list) { VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = " @@ -485,22 +556,19 @@ void GpuPsGraphTable::build_graph_from_cpu( clear_graph_info(); for (int i = 0; i < cpu_graph_list.size(); i++) { platform::CUDADeviceGuard guard(resource_->dev_id(i)); - // platform::CUDADeviceGuard guard(i); gpu_graph_list[i] = GpuPsCommGraph(); sample_status[i] = NULL; - // auto table = - // new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); - tables_[i] = new Table( - std::max((unsigned int)1, cpu_graph_list[i].node_size) / load_factor_); + tables_[i] = new Table(std::max((int64_t)1, cpu_graph_list[i].node_size) / + load_factor_); if (cpu_graph_list[i].node_size > 0) { std::vector keys; - std::vector offset; + std::vector offset; cudaMalloc((void**)&gpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode)); cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); - for (unsigned int j = 0; j < cpu_graph_list[i].node_size; j++) { + for (int64_t j = 0; j < cpu_graph_list[i].node_size; j++) { keys.push_back(cpu_graph_list[i].node_list[j].node_id); offset.push_back(j); } @@ -512,12 +580,9 @@ void GpuPsGraphTable::build_graph_from_cpu( gpu_graph_list[i].node_size = 0; } if (cpu_graph_list[i].neighbor_size) { - int* addr; - cudaMalloc((void**)&addr, cpu_graph_list[i].neighbor_size * sizeof(int)); - cudaMemset(addr, 0, cpu_graph_list[i].neighbor_size * sizeof(int)); - sample_status[i] = addr; cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, cpu_graph_list[i].neighbor_size * sizeof(int64_t)); + cudaMemcpy(gpu_graph_list[i].neighbor_list, cpu_graph_list[i].neighbor_list, cpu_graph_list[i].neighbor_size * sizeof(int64_t), @@ -533,8 +598,8 @@ void GpuPsGraphTable::build_graph_from_cpu( NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch) { - return graph_neighbor_sample_v2(q.gpu_id, q.key, q.sample_size, q.len, - cpu_switch); + return
graph_neighbor_sample_v2(global_device_map[q.gpu_id], q.key, + q.sample_size, q.len, cpu_switch); } NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, int64_t* key, @@ -571,12 +636,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, } platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - // cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); - // cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); int* actual_sample_size = result.actual_sample_size; int64_t* val = result.val; int total_gpu = resource_->total_device(); - // int dev_id = resource_->dev_id(gpu_id); auto stream = resource_->local_stream(gpu_id, 0); int grid_size = (len - 1) / block_size_ + 1; @@ -605,9 +667,6 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); - // fill_shard_key<<>>(d_shard_keys_ptr, - // key, - // d_idx_ptr, len); heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, key, d_idx_ptr, len, stream); cudaStreamSynchronize(stream); @@ -643,95 +702,47 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, of alloc_mem_i, actual_sample_size_of_x equals ((int *)alloc_mem_i)[shard_len + x] */ + create_storage(gpu_id, i, shard_len * sizeof(int64_t), - shard_len * (1 + sample_size) * sizeof(int64_t)); - auto& node = path_[gpu_id][i].nodes_[0]; - cudaMemsetAsync(node.val_storage, -1, shard_len * sizeof(int), - node.in_stream); + shard_len * (1 + sample_size) * sizeof(int64_t) + + sizeof(int) * (shard_len + shard_len % 2)); + // auto& node = path_[gpu_id][i].nodes_[0]; } - // auto end1 = std::chrono::steady_clock::now(); - // auto tt = std::chrono::duration_cast(end1 - - // start1); - // VLOG(0)<< "create storage time " << tt.count() << " us"; walk_to_dest(gpu_id, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); for (int i = 0; i < total_gpu; ++i) { if (h_left[i] == -1) { continue; } + int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; auto& node = path_[gpu_id][i].nodes_.back(); + cudaMemsetAsync(node.val_storage, -1, shard_len * sizeof(int64_t), + node.in_stream); cudaStreamSynchronize(node.in_stream); platform::CUDADeviceGuard guard(resource_->dev_id(i)); - // platform::CUDADeviceGuard guard(i); - // use the key-value map to update alloc_mem_i[0,shard_len) - // tables_[i]->rwlock_->RDLock(); tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), + reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); // node.in_stream); - int shard_len = h_right[i] - h_left[i] + 1; auto graph = gpu_graph_list[i]; - unsigned int* id_array = reinterpret_cast(node.val_storage); + int64_t* id_array = reinterpret_cast(node.val_storage); int* actual_size_array = (int*)(id_array + shard_len); - int64_t* sample_array = (int64_t*)(actual_size_array + shard_len); - int sample_grid_size = (shard_len - 1) / dim_y + 1; - dim3 block(parallel_sample_size, dim_y); - dim3 grid(sample_grid_size); - // int sample_grid_size = shard_len / block_size_ + 1; - // VLOG(0)<<"in sample grid_size = "< user_feature_name = {"a", "b", "c", "d"}; -std::vector item_feature_name = {"a"}; -std::vector user_feature_dtype = {"float32", "int32", "string", - "string"}; -std::vector item_feature_dtype = {"float32"}; -std::vector user_feature_shape = {1, 2, 1, 1}; -std::vector item_feature_shape = {1}; -void prepare_file(char file_name[]) { - std::ofstream ofile; - ofile.open(file_name); - - for (auto x : nodes) { - ofile << x << std::endl; - } - ofile.close(); -} +std::shared_ptr GraphGpuWrapper::s_instance_(nullptr); void GraphGpuWrapper::set_device(std::vector ids) { for (auto device_id : ids) { device_id_mapping.push_back(device_id); @@ -205,96 +172,35 @@ void GraphGpuWrapper::upload_batch(int idx, // g->build_graph_from_cpu(vec); } -void GraphGpuWrapper::initialize() { - std::vector device_id_mapping; - for (int i = 0; i < 2; i++) device_id_mapping.push_back(i); - int gpu_num = device_id_mapping.size(); - ::paddle::distributed::GraphParameter table_proto; - table_proto.add_edge_types("u2u"); - table_proto.add_node_types("user"); - table_proto.add_node_types("item"); - ::paddle::distributed::GraphFeature *g_f = table_proto.add_graph_feature(); - - for (int i = 0; i < user_feature_name.size(); i++) { - g_f->add_name(user_feature_name[i]); - g_f->add_dtype(user_feature_dtype[i]); - g_f->add_shape(user_feature_shape[i]); - } - ::paddle::distributed::GraphFeature *g_f1 = table_proto.add_graph_feature(); - for (int i = 0; i < item_feature_name.size(); i++) { - g_f1->add_name(item_feature_name[i]); - g_f1->add_dtype(item_feature_dtype[i]); - g_f1->add_shape(item_feature_shape[i]); - } - prepare_file(node_file_name); - table_proto.set_shard_num(24); +// void GraphGpuWrapper::test() { +// int64_t cpu_key[3] = {0, 1, 2}; +// void *key; +// platform::CUDADeviceGuard guard(0); +// cudaMalloc((void **)&key, 3 * sizeof(int64_t)); +// cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); +// auto neighbor_sample_res = +// ((GpuPsGraphTable *)graph_table) +// ->graph_neighbor_sample(0, (int64_t *)key, 2, 3); +// int64_t *res = new int64_t[7]; +// cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t), +// cudaMemcpyDeviceToHost); +// int *actual_sample_size = new int[3]; +// cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, +// 3 * sizeof(int), +// cudaMemcpyDeviceToHost); // 3, 1, 3 - std::shared_ptr resource = - 
std::make_shared(device_id_mapping); - resource->enable_p2p(); - GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1); - g->init_cpu_table(table_proto); - graph_table = (char *)g; - g->cpu_graph_table->Load(node_file_name, "nuser"); - g->cpu_graph_table->Load(node_file_name, "nitem"); - std::remove(node_file_name); - std::vector vec; - std::vector node_ids; - node_ids.push_back(37); - node_ids.push_back(96); - std::vector> node_feat(2, - std::vector(2)); - std::vector feature_names; - feature_names.push_back(std::string("c")); - feature_names.push_back(std::string("d")); - g->cpu_graph_table->get_node_feat(0, node_ids, feature_names, node_feat); - VLOG(0) << "get_node_feat: " << node_feat[0][0]; - VLOG(0) << "get_node_feat: " << node_feat[0][1]; - VLOG(0) << "get_node_feat: " << node_feat[1][0]; - VLOG(0) << "get_node_feat: " << node_feat[1][1]; - int n = 10; - std::vector ids0, ids1; - for (int i = 0; i < n; i++) { - g->cpu_graph_table->add_comm_edge(0, i, (i + 1) % n); - g->cpu_graph_table->add_comm_edge(0, i, (i - 1 + n) % n); - if (i % 2 == 0) ids0.push_back(i); - } - g->cpu_graph_table->build_sampler(0); - ids1.push_back(5); - vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(0, ids0)); - vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(0, ids1)); - vec[0].display_on_cpu(); - vec[1].display_on_cpu(); - g->build_graph_from_cpu(vec); -} -void GraphGpuWrapper::test() { - int64_t cpu_key[3] = {0, 1, 2}; - void *key; - platform::CUDADeviceGuard guard(0); - cudaMalloc((void **)&key, 3 * sizeof(int64_t)); - cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); - auto neighbor_sample_res = - ((GpuPsGraphTable *)graph_table) - ->graph_neighbor_sample(0, (int64_t *)key, 2, 3); - int64_t *res = new int64_t[7]; - cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t), - cudaMemcpyDeviceToHost); - int *actual_sample_size = new int[3]; - cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, - 3 * sizeof(int), - cudaMemcpyDeviceToHost); // 3, 1, 3 - - //{0,9} or {9,0} is expected for key 0 - //{0,2} or {2,0} is expected for key 1 - //{1,3} or {3,1} is expected for key 2 - for (int i = 0; i < 3; i++) { - VLOG(0) << "actual sample size for " << i << " is " - << actual_sample_size[i]; - for (int j = 0; j < actual_sample_size[i]; j++) { - VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 + j]; - } - } -} +// //{0,9} or {9,0} is expected for key 0 +// //{0,2} or {2,0} is expected for key 1 +// //{1,3} or {3,1} is expected for key 2 +// for (int i = 0; i < 3; i++) { +// VLOG(0) << "actual sample size for " << i << " is " +// << actual_sample_size[i]; +// for (int j = 0; j < actual_sample_size[i]; j++) { +// VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 + +// j]; +// } +// } +// } NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch) { return ((GpuPsGraphTable *)graph_table) @@ -314,7 +220,6 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( auto neighbor_sample_res = ((GpuPsGraphTable *)graph_table) ->graph_neighbor_sample(gpu_id, cuda_key, sample_size, key.size()); - int *actual_sample_size = new int[key.size()]; cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, key.size() * sizeof(int), @@ -323,7 +228,6 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( for (int i = 0; i < key.size(); i++) { cumsum += actual_sample_size[i]; } - /* VLOG(0) << "cumsum " << cumsum; */ std::vector cpu_key, res; cpu_key.resize(key.size() * sample_size); @@ 
-340,11 +244,18 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( /* for(int i = 0;i < res.size();i ++) { */ /* VLOG(0) << i << " " << res[i]; */ /* } */ - + delete[] actual_sample_size; cudaFree(cuda_key); return res; } +void GraphGpuWrapper::init_sample_status() { + ((GpuPsGraphTable *)graph_table)->init_sample_status(); +} + +void GraphGpuWrapper::free_sample_status() { + ((GpuPsGraphTable *)graph_table)->free_sample_status(); +} NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id, int start, int query_size) { return ((GpuPsGraphTable *)graph_table) diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index b638311304773..d8b11682bc8c5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include #include @@ -22,10 +23,13 @@ namespace framework { #ifdef PADDLE_WITH_HETERPS class GraphGpuWrapper { public: - static GraphGpuWrapper* GetInstance() { - static GraphGpuWrapper wrapper; - return &wrapper; + static std::shared_ptr GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::framework::GraphGpuWrapper()); + } + return s_instance_; } + static std::shared_ptr s_instance_; void initialize(); void test(); void set_device(std::vector ids); @@ -53,6 +57,8 @@ class GraphGpuWrapper { std::vector& key, int sample_size); + void init_sample_status(); + void free_sample_status(); std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; std::vector> table_feat_mapping; @@ -62,7 +68,7 @@ class GraphGpuWrapper { ::paddle::distributed::GraphParameter table_proto; std::vector device_id_mapping; int search_level = 1; - char* graph_table; + void* graph_table; }; #endif } diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index b860ea5d39cb5..e2f362d407458 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -41,9 +41,7 @@ limitations under the License. 
*/ #include "xpu/kernel/simd.h" #endif -#if defined(PADDLE_WITH_XPU_KP) #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" -#endif namespace paddle { namespace framework { @@ -132,10 +130,8 @@ class HashTable { void show(); -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config); void set_embedx_sgd(const OptimizerConfig& optimizer_config); -#endif template void dump_to_cpu(int devid, StreamType stream); @@ -178,9 +174,10 @@ class HashTable { TableContainer* container_; #elif defined(PADDLE_WITH_XPU_KP) XPUCacheArray* container_; - OptimizerConfig* xpu_optimizer_config_; - OptimizerConfig cpu_optimizer_config_; #endif + OptimizerConfig* device_optimizer_config_; + OptimizerConfig host_optimizer_config_; + int BLOCK_SIZE_{256}; float LOAD_FACTOR{0.75f}; size_t capacity_; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 87b62c6d380a4..5a29159aa12a8 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -95,6 +95,7 @@ __global__ void dy_mf_search_kernel(Table* table, template __global__ void update_kernel(Table* table, + const OptimizerConfig& optimizer_config, const typename Table::key_type* const keys, const GradType* const grads, size_t len, Sgd sgd) { @@ -102,13 +103,14 @@ __global__ void update_kernel(Table* table, if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - sgd.update_value((it.getter())->second, grads[i]); + sgd.update_value(optimizer_config, (it.getter())->second, grads[i]); } } } template __global__ void dy_mf_update_kernel(Table* table, + const OptimizerConfig& optimizer_config, const typename Table::key_type* const keys, const char* const grads, size_t len, Sgd sgd, size_t grad_value_size) { @@ -117,7 +119,7 @@ __global__ void dy_mf_update_kernel(Table* table, auto it = table->find(keys[i]); if (it != table->end()) { FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); - sgd.dy_mf_update_value((it.getter())->second, *cur); + sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); } else { printf("yxf::push miss key: %d", keys[i]); } @@ -127,6 +129,9 @@ __global__ void dy_mf_update_kernel(Table* table, template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); + cudaMalloc((void**)&device_optimizer_config_, sizeof(OptimizerConfig)); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); rwlock_.reset(new phi::RWLock); } @@ -135,6 +140,22 @@ HashTable::~HashTable() { delete container_; } +template +void HashTable::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { + host_optimizer_config_.set_sparse_sgd(optimizer_config); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); +} + +template +void HashTable::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { + host_optimizer_config_.set_embedx_sgd(optimizer_config); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); +} + template void HashTable::show() { container_->print(); @@ -279,8 +300,8 @@ void HashTable::update(const KeyType* d_keys, return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - update_kernel<<>>(container_, d_keys, - d_grads, len, sgd); + update_kernel<<>>( + 
container_, *device_optimizer_config_, d_keys, d_grads, len, sgd); } template @@ -293,11 +314,13 @@ void HashTable::update(const KeyType* d_keys, } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_update_kernel<<>>( - container_, d_keys, d_grads, len, sgd, push_grad_value_size_); + container_, *device_optimizer_config_, d_keys, d_grads, len, sgd, + push_grad_value_size_); } template class HashTable; template class HashTable; +template class HashTable; template class HashTable; template class HashTable; @@ -312,6 +335,9 @@ template void HashTable::get(const long* d_keys, template void HashTable::get( const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get(const long* d_keys, + long* d_vals, size_t len, + cudaStream_t stream); template void HashTable::get( const long* d_keys, unsigned int* d_vals, size_t len, cudaStream_t stream); // template void @@ -328,6 +354,10 @@ template void HashTable::insert(const long* d_keys, const int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::insert(const long* d_keys, + const long* d_vals, + size_t len, + cudaStream_t stream); template void HashTable::insert( const long* d_keys, const unsigned long* d_vals, size_t len, diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps index cd43a73b44ec3..79c5f3d757781 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps @@ -163,7 +163,7 @@ __global__ void search_kernel(Table& table, const KeyType* const keys, } template -__global__ void update_kernel(OptimizerConfig& optimizer_config, Table& table, +__global__ void update_kernel(Table& table, OptimizerConfig& optimizer_config, const KeyType* const keys, const GradType* const grads, long long len) { int cid = core_id(); @@ -202,12 +202,9 @@ HashTable::HashTable(size_t capacity) { sizeof(XPUCacheArray)); xpu_memcpy((void*)container_, &tmp_container, sizeof(XPUCacheArray), XPU_HOST_TO_DEVICE); - - OptimizerConfig tmp_opt_config; - xpu_malloc(reinterpret_cast(&xpu_optimizer_config_), + xpu_malloc(reinterpret_cast(&device_optimizer_config_), sizeof(OptimizerConfig)); - - xpu_memcpy((void*)xpu_optimizer_config_, &tmp_opt_config, + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); rwlock_.reset(new phi::RWLock); @@ -216,7 +213,7 @@ HashTable::HashTable(size_t capacity) { template HashTable::~HashTable() { xpu_free((void*)container_); - xpu_free((void*)xpu_optimizer_config_); + xpu_free((void*)device_optimizer_config_); } template @@ -227,28 +224,16 @@ void HashTable::show() { template void HashTable::set_sparse_sgd( const OptimizerConfig& optimizer_config) { - cpu_optimizer_config_.nonclk_coeff = optimizer_config.nonclk_coeff; - cpu_optimizer_config_.clk_coeff = optimizer_config.clk_coeff; - cpu_optimizer_config_.min_bound = optimizer_config.min_bound; - cpu_optimizer_config_.max_bound = optimizer_config.max_bound; - cpu_optimizer_config_.learning_rate = optimizer_config.learning_rate; - cpu_optimizer_config_.initial_g2sum = optimizer_config.initial_g2sum; - cpu_optimizer_config_.initial_range = optimizer_config.initial_range; - xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_, + host_optimizer_config_.set_sparse_sgd(optimizer_config); + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); } 
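Both backends now follow the same pattern here: the hash table keeps an authoritative host-side OptimizerConfig (host_optimizer_config_), mirrors it into device memory (device_optimizer_config_) whenever set_sparse_sgd or set_embedx_sgd is called, and hands the device copy to the update kernels. A minimal standalone sketch of that idea for the CUDA path (SgdConfig and ConfigMirror below are illustrative names, not part of this change):

#include <cuda_runtime.h>

struct SgdConfig {           // stand-in for OptimizerConfig
  float learning_rate = 0.05f;
  float initial_g2sum = 3.0f;
};

class ConfigMirror {
 public:
  ConfigMirror() {
    cudaMalloc(reinterpret_cast<void**>(&device_cfg_), sizeof(SgdConfig));
    sync();  // push the defaults so kernels never read uninitialized memory
  }
  ~ConfigMirror() { cudaFree(device_cfg_); }
  void set_learning_rate(float lr) {
    host_cfg_.learning_rate = lr;  // update the host copy first
    sync();                        // then mirror it to device memory
  }
  const SgdConfig* device_ptr() const { return device_cfg_; }  // handed to kernels

 private:
  void sync() {
    cudaMemcpy(device_cfg_, &host_cfg_, sizeof(SgdConfig),
               cudaMemcpyHostToDevice);
  }
  SgdConfig host_cfg_;               // authoritative host-side copy
  SgdConfig* device_cfg_ = nullptr;  // device mirror read by update kernels
};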
template void HashTable::set_embedx_sgd( const OptimizerConfig& optimizer_config) { - cpu_optimizer_config_.mf_create_thresholds = - optimizer_config.mf_create_thresholds; - cpu_optimizer_config_.mf_learning_rate = optimizer_config.mf_learning_rate; - cpu_optimizer_config_.mf_initial_g2sum = optimizer_config.mf_initial_g2sum; - cpu_optimizer_config_.mf_initial_range = optimizer_config.mf_initial_range; - cpu_optimizer_config_.mf_min_bound = optimizer_config.mf_min_bound; - cpu_optimizer_config_.mf_max_bound = optimizer_config.mf_max_bound; - xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_, + host_optimizer_config_.set_embedx_sgd(optimizer_config); + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); } @@ -306,7 +291,7 @@ void HashTable::update(const KeyType* d_keys, long long c_len = (long long)len; update_kernel, GradType><<<4, 64, stream>>>( - *xpu_optimizer_config_, *container_, d_keys, d_grads, c_len); + *container_, *device_optimizer_config_, d_keys, d_grads, c_len); } template diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 6379f7ee91264..e53a962c5abde 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -65,10 +65,8 @@ class HeterComm { void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len); #endif -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config); void set_embedx_sgd(const OptimizerConfig& optimizer_config); -#endif int log2i(int x); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 098adc2bdeb88..d23719ea9eb77 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -193,9 +193,10 @@ void HeterComm::walk_to_dest(int start_index, memory_copy(dst_place, node.key_storage, src_place, reinterpret_cast(src_key + h_left[i]), node.key_bytes_len, node.in_stream); -#if defined(PADDLE_WITH_CUDA) // adapt for gpu-graph - cudaMemsetAsync(node.val_storage, -1, node.val_bytes_len, node.in_stream); -#endif + // #if defined(PADDLE_WITH_CUDA) // adapt for gpu-graph + // cudaMemsetAsync(node.val_storage, -1, node.val_bytes_len, + // node.in_stream); + // #endif if (need_copy_val) { memory_copy(dst_place, node.val_storage, src_place, @@ -342,7 +343,6 @@ int HeterComm::get_index_by_devid(int devid) { return resource_->get_index_by_devid(devid); } -#if defined(PADDLE_WITH_XPU_KP) template void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { @@ -358,7 +358,6 @@ void HeterComm::set_embedx_sgd( table->set_embedx_sgd(optimizer_config); } } -#endif template void HeterComm::build_ps( diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 581b0d511c23e..66e06b13b046f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -48,6 +48,14 @@ int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } +void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { + comm_->set_sparse_sgd(optimizer_config); +} + +void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { + comm_->set_embedx_sgd(optimizer_config); +} + void HeterPs::end_pass() { comm_->end_pass(); } void 
HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 7060817be91eb..70b88350f2720 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -44,10 +44,8 @@ class HeterPs : public HeterPsBase { int comm_size) override; #endif -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; void set_embedx_sgd(const OptimizerConfig& optimizer_config) override; -#endif void end_pass() override; int get_index_by_devid(int devid) override; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 79061ab66af1c..0727e2c2dbce1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -16,9 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#if defined(PADDLE_WITH_XPU_KP) #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" -#endif #ifdef PADDLE_WITH_HETERPS @@ -48,10 +46,8 @@ class HeterPsBase { virtual void push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) = 0; -#if defined(PADDLE_WITH_XPU_KP) - virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) {} - virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) {} -#endif + virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; + virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; static HeterPsBase* get_instance(size_t capacity, std::shared_ptr resource); diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index ebf7dd277c7d6..065d5e6d527fc 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -35,58 +35,64 @@ class Optimizer { void initialize() {} - __device__ void update_lr(float& w, float& g2sum, float g, // NOLINT + __device__ void update_lr(const OptimizerConfig& optimizer_config, + float& w, // NOLINT + float& g2sum, float g, // NOLINT float scale) { double add_g2sum = 0; - double ratio = optimizer_config::learning_rate * - sqrt(optimizer_config::initial_g2sum / - (optimizer_config::initial_g2sum + g2sum)); + double ratio = optimizer_config.learning_rate * + sqrt(optimizer_config.initial_g2sum / + (optimizer_config.initial_g2sum + g2sum)); double scaled_grad = g / scale; w += scaled_grad * ratio; - if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; - if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; + if (w < optimizer_config.min_bound) w = optimizer_config.min_bound; + if (w > optimizer_config.max_bound) w = optimizer_config.max_bound; add_g2sum += scaled_grad * scaled_grad; g2sum += add_g2sum; } - __device__ void update_mf(int n, float* w, float& g2sum, // NOLINT + __device__ void update_mf(const OptimizerConfig& optimizer_config, int n, + float* w, + float& g2sum, // NOLINT const float* g, float scale) { double add_g2sum = 0; - double ratio = optimizer_config::mf_learning_rate * - sqrt(optimizer_config::mf_initial_g2sum / - (optimizer_config::mf_initial_g2sum + g2sum)); + double ratio = optimizer_config.mf_learning_rate * + 
sqrt(optimizer_config.mf_initial_g2sum / + (optimizer_config.mf_initial_g2sum + g2sum)); for (int i = 0; i < n; ++i) { double scaled_grad = g[i] / scale; w[i] += scaled_grad * ratio; - if (w[i] < optimizer_config::mf_min_bound) - w[i] = optimizer_config::mf_min_bound; - if (w[i] > optimizer_config::mf_max_bound) - w[i] = optimizer_config::mf_max_bound; + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; add_g2sum += scaled_grad * scaled_grad; } g2sum += add_g2sum / n; } - __device__ void update_value(ValType& val, const GradType& grad) { // NOLINT + __device__ void update_value(const OptimizerConfig& optimizer_config, + ValType& val, // NOLINT + const GradType& grad) { val.slot = grad.slot; val.show += grad.show; val.clk += grad.clk; - val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; + val.delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + + optimizer_config.clk_coeff * grad.clk; - update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); + update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show); if (val.mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (val.show - val.clk) + - optimizer_config::clk_coeff * val.clk) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * (val.show - val.clk) + + optimizer_config.clk_coeff * val.clk) { val.mf_size = MF_DIM + 1; val.mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; @@ -94,30 +100,31 @@ class Optimizer { curand_init(clock64(), tid_x, 0, &state); for (int i = 0; i < MF_DIM; ++i) { val.mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; + (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); + update_mf(optimizer_config, MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, + grad.show); } } - __device__ void dy_mf_update_value(ValType* ptr, const GradType& grad) { + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + ValType* ptr, const GradType& grad) { ptr->slot = grad.slot; ptr->show += grad.show; ptr->clk += grad.clk; - ptr->delta_score += - optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; + ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + + optimizer_config.clk_coeff * grad.clk; - update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); + update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); // use MF_DIM temporarily // ptr->mf_dim = grad.mf_dim; if (ptr->mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config::clk_coeff * ptr->clk) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + + optimizer_config.clk_coeff * ptr->clk) { // ptr->mf_size = ptr->mf_dim + 1; ptr->mf_size = MF_DIM + 1; @@ -127,11 +134,11 @@ class Optimizer { curand_init(clock64(), tid_x, 0, &state); for (int i = 0; i < MF_DIM; ++i) { ptr->mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; + (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + update_mf(optimizer_config, MF_DIM, &(ptr->mf[1]), 
ptr->mf[0], grad.mf_g, grad.show); // for local test } } diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index 2a80aa4b52d91..03caeb984f7c9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -14,50 +14,69 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) +namespace paddle { +namespace framework { -namespace optimizer_config { +class OptimizerConfig { + public: + float nonclk_coeff = 0.1; + float clk_coeff = 1; -__constant__ float nonclk_coeff = 0.1; -__constant__ float clk_coeff = 1; + float min_bound = -10; + float max_bound = 10; + float learning_rate = 0.05; + float initial_g2sum = 3.0; + float initial_range = 0; -__constant__ float min_bound = -10; -__constant__ float max_bound = 10; -__constant__ float learning_rate = 0.05; -__constant__ float initial_g2sum = 3.0; -__constant__ float initial_range = 0; + float mf_create_thresholds = 10; + float mf_learning_rate = 0.05; + float mf_initial_g2sum = 3.0; + float mf_initial_range = 1e-4; + float mf_min_bound = -10; + float mf_max_bound = 10; -__constant__ float mf_create_thresholds = 10; -__constant__ float mf_learning_rate = 0.05; -__constant__ float mf_initial_g2sum = 3.0; -__constant__ float mf_initial_range = 1e-4; -__constant__ float mf_min_bound = -10; -__constant__ float mf_max_bound = 10; -} // namespace optimizer_config + void set_sparse_sgd(float nonclk_coeff, float clk_coeff, float min_bound, + float max_bound, float learning_rate, float initial_g2sum, + float initial_range) { + this->nonclk_coeff = nonclk_coeff; + this->clk_coeff = clk_coeff; + this->min_bound = min_bound; + this->max_bound = max_bound; + this->learning_rate = learning_rate; + this->initial_g2sum = initial_g2sum; + this->initial_range = initial_range; + } -#elif defined(PADDLE_WITH_XPU_KP) -namespace paddle { -namespace framework { + void set_sparse_sgd(const OptimizerConfig& optimizer_config) { + this->nonclk_coeff = optimizer_config.nonclk_coeff; + this->clk_coeff = optimizer_config.clk_coeff; + this->min_bound = optimizer_config.min_bound; + this->max_bound = optimizer_config.max_bound; + this->learning_rate = optimizer_config.learning_rate; + this->initial_g2sum = optimizer_config.initial_g2sum; + this->initial_range = optimizer_config.initial_range; + } -class OptimizerConfig { - public: - float nonclk_coeff; - float clk_coeff; - - float min_bound; - float max_bound; - float learning_rate; - float initial_g2sum; - float initial_range; - - float mf_create_thresholds; - float mf_learning_rate; - float mf_initial_g2sum; - float mf_initial_range; - float mf_min_bound; - float mf_max_bound; + void set_embedx_sgd(float mf_create_thresholds, float mf_learning_rate, + float mf_initial_g2sum, float mf_initial_range, + float mf_min_bound, float mf_max_bound) { + this->mf_create_thresholds = mf_create_thresholds; + this->mf_learning_rate = mf_learning_rate; + this->mf_initial_g2sum = mf_initial_g2sum; + this->mf_initial_range = mf_initial_range; + this->mf_min_bound = mf_min_bound; + this->mf_max_bound = mf_max_bound; + } + + void set_embedx_sgd(const OptimizerConfig& optimizer_config) { + this->mf_create_thresholds = optimizer_config.mf_create_thresholds; + this->mf_learning_rate = optimizer_config.mf_learning_rate; + this->mf_initial_g2sum = optimizer_config.mf_initial_g2sum; + this->mf_initial_range = optimizer_config.mf_initial_range; + this->mf_min_bound = 
optimizer_config.mf_min_bound; + this->mf_max_bound = optimizer_config.mf_max_bound; + } }; + } // namespace framework } // namespace paddle - -#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu index b3a38a6dfde49..ff3cd9d2d046d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" @@ -235,4 +236,9 @@ TEST(TEST_FLEET, test_cpu_cache) { } index++; } + auto iter = paddle::framework::GraphGpuWrapper::GetInstance(); + std::vector device; + device.push_back(0); + device.push_back(1); + iter->set_device(device); } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 64765c98fd04b..f512fcc7b9fdb 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -898,17 +898,9 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); -#ifdef PADDLE_WITH_CUDA - VLOG(3) << "Begine Gpu Ps PullSparse"; + VLOG(3) << "Begin Gpu/Xpu Ps PullSparse"; auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); -#endif -#ifdef PADDLE_WITH_XPU_KP - VLOG(3) << "Begine Xpu Ps PullSparse"; - FeatureValue* total_values_gpu = nullptr; - xpu_malloc(reinterpret_cast(&total_values_gpu), - total_length * sizeof(FeatureValue)); -#endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GpuPs now.")); @@ -969,19 +961,11 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, slot_lengths_lod[i] += slot_lengths_lod[i - 1]; } - uint64_t* buf_key = nullptr; - int64_t* buf_length = nullptr; - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_key), - keys.size() * sizeof(uint64_t*)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_length), - slot_lengths.size() * sizeof(int64_t)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); - - uint64_t** xpu_keys = reinterpret_cast(&buf_key); - int64_t* xpu_len = reinterpret_cast(buf_length); + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** xpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* xpu_len = reinterpret_cast(buf_length->ptr()); PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), XPU_HOST_TO_DEVICE)); @@ -997,8 +981,6 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, pull_gpups_timer.Start(); HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, static_cast(total_length)); - // PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - // "PullSparseGPU failed in GPUPS."));
pull_gpups_timer.Pause(); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length @@ -1029,22 +1011,16 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); -#ifdef PADDLE_WITH_CUDA + // #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin GPUPS PushSparseGrad"; auto buf = memory::Alloc(place, total_length * sizeof(FeaturePushValue)); FeaturePushValue* total_grad_values_gpu = reinterpret_cast(buf->ptr()); -#endif -#ifdef PADDLE_WITH_XPU_KP - VLOG(3) << "Begine Xpu Ps PushSparseGrad"; - FeaturePushValue* total_grad_values_gpu = nullptr; - xpu_malloc(reinterpret_cast(&total_grad_values_gpu), - total_length * sizeof(FeaturePushValue)); -#endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); } else if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; @@ -1060,7 +1036,9 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); +#endif } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU_KP int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; @@ -1076,6 +1054,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); +#endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index cf7d98db27e84..3df5a4b473861 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -181,35 +181,21 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, float initial_range) { - cudaMemcpyToSymbol(optimizer_config::nonclk_coeff, &nonclk_coeff, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::clk_coeff, &clk_coeff, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::min_bound, &min_bound, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::max_bound, &max_bound, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::learning_rate, &learning_rate, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::initial_g2sum, &initial_g2sum, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::initial_range, &initial_range, - sizeof(float)); + OptimizerConfig optimizer_config; + optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, + learning_rate, initial_g2sum, initial_range); + HeterPs_->set_sparse_sgd(optimizer_config); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_learning_rate, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, float mf_max_bound) { - cudaMemcpyToSymbol(optimizer_config::mf_create_thresholds, - &mf_create_thresholds, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_learning_rate, &mf_learning_rate, 
- sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_initial_g2sum, &mf_initial_g2sum, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_initial_range, &mf_initial_range, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_min_bound, &mf_min_bound, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_max_bound, &mf_max_bound, - sizeof(float)); + OptimizerConfig optimizer_config; + optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, + mf_initial_g2sum, mf_initial_range, + mf_min_bound, mf_max_bound); + HeterPs_->set_embedx_sgd(optimizer_config); } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index 571a090b9b4a6..58b9f0f722f8c 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -84,7 +84,7 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, } } -__global__ void CopyKeysKernel(unsigned long long** src_keys, +__global__ void CopyKeysKernel(unsigned long long* src_keys, unsigned long long* dest_total_keys, const long long* len, int slot_num, int total_len) { @@ -95,21 +95,27 @@ __global__ void CopyKeysKernel(unsigned long long** src_keys, } int thread_id = ncores * cluster_id() + cid; int nthreads = ncores * cluster_num(); - __local__ int64_t local_len[slot_num]; - GM2LM(len, local_len, slot_num * sizeof(int64_t)); + __local__ long long local_len[slot_num]; + GM2LM(len, local_len, slot_num * sizeof(long long)); + + __global_ptr__ unsigned long long* local_keys[slot_num]; + GM2LM(src_keys, local_keys, + slot_num * sizeof(__global_ptr__ unsigned long long*)); for (int i = thread_id; i < slot_num; i += nthreads) { // max core local memory = 8KB int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; - int read_len = min(slot_len, 1024); + // int read_len = min(slot_len, 1024); + int read_len = 100; int dest_len = i ? 
local_len[i - 1] : 0; - __local__ uint64_t local_slot_keys[read_len]; + __local__ unsigned long long local_slot_keys[read_len]; for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(src_keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); + GM2LM(local_keys[i] + k, local_slot_keys, + real_read_len * sizeof(unsigned long long)); LM2GM(local_slot_keys, dest_total_keys + dest_len + k, - real_read_len * sizeof(uint64_t)); + real_read_len * sizeof(unsigned long long)); } } } @@ -199,7 +205,8 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, stream = static_cast(dev_ctx) ->x_context() ->xpu_stream; - unsigned long long** o_keys = (unsigned long long**)origin_keys; + unsigned long long* o_keys = + reinterpret_cast(origin_keys); unsigned long long* t_keys = (unsigned long long*)total_keys; const long long* c_len = (const long long*)gpu_len; CopyKeysKernel<<<2, 64, stream>>>(o_keys, t_keys, c_len, slot_num, total_len); @@ -256,13 +263,8 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float learning_rate, float initial_g2sum, float initial_range) { OptimizerConfig optimizer_config; - optimizer_config.nonclk_coeff = nonclk_coeff; - optimizer_config.clk_coeff = clk_coeff; - optimizer_config.min_bound = min_bound; - optimizer_config.max_bound = max_bound; - optimizer_config.learning_rate = learning_rate; - optimizer_config.initial_g2sum = initial_g2sum; - optimizer_config.initial_range = initial_range; + optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, + learning_rate, initial_g2sum, initial_range); HeterPs_->set_sparse_sgd(optimizer_config); } @@ -271,12 +273,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_initial_range, float mf_min_bound, float mf_max_bound) { OptimizerConfig optimizer_config; - optimizer_config.mf_create_thresholds = mf_create_thresholds; - optimizer_config.mf_learning_rate = mf_learning_rate; - optimizer_config.mf_initial_g2sum = mf_initial_g2sum; - optimizer_config.mf_initial_range = mf_initial_range; - optimizer_config.mf_min_bound = mf_min_bound; - optimizer_config.mf_max_bound = mf_max_bound; + optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, + mf_initial_g2sum, mf_initial_range, + mf_min_bound, mf_max_bound); HeterPs_->set_embedx_sgd(optimizer_config); } diff --git a/paddle/fluid/framework/heter_pipeline_trainer.cc b/paddle/fluid/framework/heter_pipeline_trainer.cc index 13eb78874c395..d0d3c2fea3b56 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer.cc @@ -282,7 +282,7 @@ void HeterPipelineTrainer::Run() { if (threads_.size() > 0) { threads_.clear(); } - VLOG(3) << "Epoch Trainging done"; + VLOG(3) << "Epoch Training done"; } void HeterPipelineTrainer::Finalize() { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a3b49476d820f..cdd703e679d95 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -95,6 +95,7 @@ pass_library(skip_layernorm_fuse_pass base) pass_library(multihead_matmul_fuse_pass inference) pass_library(adaptive_pool2d_convert_global_pass inference) pass_library(unsqueeze2_eltwise_fuse_pass inference) +pass_library(yolo_box_fuse_pass inference) pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) @@ -107,6 +108,9 @@ if(WITH_TENSORRT) 
pass_library(trt_map_matmul_to_mul_pass inference) pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) pass_library(preln_skip_layernorm_fuse_pass inference) + pass_library(set_transformer_input_convert_pass inference) + pass_library(remove_padding_recover_padding_pass inference) + pass_library(delete_remove_padding_recover_padding_pass inference) endif() if(WITH_GPU OR WITH_ROCM) @@ -118,10 +122,12 @@ if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn) pass_library(mkldnn_inplace_pass inference DEPS mkldnn_placement_pass op_registry elementwise_add_op gelu_op activation_op softmax_op softmax DIR mkldnn) pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn) + pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn) pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(scale_matmul_fuse_pass inference DIR mkldnn) pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) @@ -208,6 +214,7 @@ if (WITH_MKLDNN) cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass) cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) + cc_test(test_int8_scale_calculation_mkldnn_pass SRCS mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc DEPS int8_scale_calculation_mkldnn_pass pass_test_util) cc_test(test_fc_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util) cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util) diff --git a/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc new file mode 100644 index 0000000000000..63233e0b584b2 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +void RecoverPadding::operator()() { + // Create nodes for recover_padding. + auto *recover_padding_input = + pattern->NewNode(recover_padding_input_repr()) + ->assert_is_op_input("recover_padding", "Input"); + auto *recover_padding_op = pattern->NewNode(recover_padding_op_repr()) + ->assert_is_op("recover_padding"); + auto *recover_padding_out = + pattern->NewNode(recover_padding_out_repr()) + ->assert_is_op_output("recover_padding", "Out"); + + // Add links for recover_padding op. + recover_padding_op->LinksFrom({recover_padding_input}) + .LinksTo({recover_padding_out}); +} +} // namespace patterns + +void DeleteRemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init(name_scope_, graph); + int found_subgraph_count = 0; + + // + GraphPatternDetector gpd; + patterns::RecoverPadding recover_padding( + gpd.mutable_pattern(), "delete_remove_padding_recover_padding_pass"); + recover_padding(); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + VLOG(3) << "delete_remove_padding_recover_padding_pass"; + + GET_IR_NODE_FROM_SUBGRAPH(recover_padding_input, recover_padding_input, + recover_padding); + GET_IR_NODE_FROM_SUBGRAPH(recover_padding_op, recover_padding_op, + recover_padding); + GET_IR_NODE_FROM_SUBGRAPH(recover_padding_out, recover_padding_out, + recover_padding); + + std::unordered_set del_node_set; + + bool delete_recover_padding = true; + for (size_t i = 0; i < recover_padding_out->outputs.size(); ++i) { + if (recover_padding_out->outputs[i]->Name() == + "remove_padding") { // op_node + auto *remove_padding_out_node = + recover_padding_out->outputs[i]->outputs[0]; // var_node + auto *out_op_node = remove_padding_out_node->outputs[0]; // op_node + IR_NODE_LINK_TO(recover_padding_input, out_op_node); + del_node_set.insert(recover_padding_out->outputs[i]); + del_node_set.insert(remove_padding_out_node); + out_op_node->Op()->RenameInput(remove_padding_out_node->Name(), + recover_padding_input->Name()); + found_subgraph_count++; + } else { + delete_recover_padding = false; + } + } + if (delete_recover_padding) { + del_node_set.insert(recover_padding_op); + del_node_set.insert(recover_padding_out); + } + GraphSafeRemoveNodes(graph, del_node_set); + }; + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_remove_padding_recover_padding_pass, + paddle::framework::ir::DeleteRemovePaddingRecoverPaddingPass); diff --git a/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h new file mode 100644 index 0000000000000..3504b124c91d1 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+class Graph;
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+struct RecoverPadding : public PatternBase {
+  RecoverPadding(PDPattern *pattern, const std::string &name_scope)
+      : PatternBase(pattern, name_scope, "recover_padding") {}
+
+  void operator()();
+
+  PATTERN_DECL_NODE(recover_padding_input);
+  PATTERN_DECL_NODE(recover_padding_op);
+  PATTERN_DECL_NODE(recover_padding_out);
+};
+}  // namespace patterns
+
+class DeleteRemovePaddingRecoverPaddingPass : public FusePassBase {
+ public:
+  DeleteRemovePaddingRecoverPaddingPass() {}
+  virtual ~DeleteRemovePaddingRecoverPaddingPass() {}
+
+ protected:
+  void ApplyImpl(Graph *graph) const;
+  const std::string name_scope_{"delete_remove_padding_recover_padding_pass"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
index 48df5869a7a1f..40e1de8a523aa 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
@@ -172,7 +172,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
     VLOG(6) << "The number of new gradients is " << new_grad_idx.size();
     if (new_grad_idx.size() == 1) return;
     // NOTE(zcd): If the gradients of backward stage and optimization stage
-    // have diff, Only take care of the the gradient of optimization stage.
+    // have diff, Only take care of the gradient of optimization stage.
     GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_map);
   }
 }
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc
index 921cf0904f632..2b7a3e1899c76 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.cc
+++ b/paddle/fluid/framework/ir/fusion_group/operation.cc
@@ -127,7 +127,7 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   // scale
   // out = (bias_after_scale) ? scale * X + bias : scale(X + bias)
-  // here we use '=' operator to seperate th default value
+  // here we use '=' operator to separate the default value
   // TODO(wangchaochaohu): Later we need to support Tensor input for scale and
   // bias.
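For readers skimming the scale comment just above: a minimal stand-alone sketch of the two bias_after_scale branches it describes. The function below is purely illustrative; the name and signature are ours, not a Paddle API.

#include <cstddef>
#include <vector>

// out = bias_after_scale ? scale * x + bias : scale * (x + bias), element-wise.
std::vector<float> ScaleOp(const std::vector<float>& x, float scale, float bias,
                           bool bias_after_scale) {
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    out[i] = bias_after_scale ? scale * x[i] + bias : scale * (x[i] + bias);
  }
  return out;
}
// ScaleOp({1, 2}, 2.0f, 1.0f, true)  -> {3, 5}
// ScaleOp({1, 2}, 2.0f, 1.0f, false) -> {4, 6}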
insert_handler( diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index fbd8fda131b6d..8c8d9fdddec85 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -720,7 +720,7 @@ bool HasOutput(Node *op, const std::string &argument) { PADDLE_ENFORCE_EQ( op->IsOp(), true, platform::errors::InvalidArgument( - "First parameter of function HasOuput must be Node::Op")); + "First parameter of function HasOutput must be Node::Op")); auto const &names = op->Op()->OutputNames(); if (std::find(names.begin(), names.end(), argument) == names.end()) return false; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d7e265fe28bf9..9e5a82fc44586 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -81,6 +81,7 @@ struct PDNode { bool IsVar() const { return type_ == Type::kVar; } const std::string& name() const { return name_; } + const PDPattern* pdpattern() const { return pattern_; } PDNode& operator=(const PDNode&) = delete; PDNode(const PDNode&) = delete; @@ -277,7 +278,44 @@ class PDPattern { */ class GraphPatternDetector { public: - using subgraph_t = std::map; + struct NodeIdCompare { + bool operator()(Node* node1, Node* node2) const { + return node1->id() < node2->id(); + } + }; + + struct PDNodeCompare { + bool operator()(const PDNode* node1, const PDNode* node2) const { + auto& nodes1 = node1->pdpattern()->nodes(); + auto& nodes2 = node2->pdpattern()->nodes(); + if (nodes1.size() != nodes2.size()) { + return nodes1.size() < nodes2.size(); + } else { + std::string pdnode_hash_key1 = ""; + std::string pdnode_hash_key2 = ""; + for (auto& node : nodes1) { + pdnode_hash_key1 += node.get()->name(); + pdnode_hash_key1 += "#"; + } + pdnode_hash_key1 += node1->name(); + for (auto& node : nodes2) { + pdnode_hash_key2 += node.get()->name(); + pdnode_hash_key2 += "#"; + } + pdnode_hash_key2 += node2->name(); + + auto pdnode_key1 = + std::to_string(std::hash()(pdnode_hash_key1)); + auto pdnode_key2 = + std::to_string(std::hash()(pdnode_hash_key2)); + + return pdnode_key1 < pdnode_key2; + } + return false; + } + }; + + using subgraph_t = std::map; // Operate on the detected pattern. using handle_t = @@ -321,7 +359,8 @@ class GraphPatternDetector { using hit_rcd_t = std::pair; PDPattern pattern_; - std::map> pdnodes2nodes_; + std::map, PDNodeCompare> + pdnodes2nodes_; }; // some helper methods. @@ -1018,7 +1057,7 @@ struct Pool : public PatternBase { // Elementwise ops // Forward pass for element-wise operators (add, mul) -// elementwise_mul_out is the result of the operator +// elementwise_out is the result of the operator struct Elementwise : public PatternBase { Elementwise(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "elementwise") {} @@ -1393,7 +1432,7 @@ struct PriorBox : public PatternBase { }; // Conv + ElementwiseAdd + an activation -// This pattern can futher fuse the conv related ops after the conv+bn fusion. +// This pattern can further fuse the conv related ops after the conv+bn fusion. 
struct ConvElementwiseaddAct : public PatternBase { ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {} diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 7c517a50e9af4..84a14200cb7a5 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { @@ -68,7 +69,7 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { std::vector weight_decay_values{}; // use map store ? - for (auto* node : graph->Nodes()) { + for (auto* node : TopologySortOperations(*graph)) { if (!node->IsOp()) { continue; } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 9fc6de3c8c172..313b2cc33459e 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -66,7 +66,7 @@ static void ShareVarInfoToCinnLaunch( << paddle::string::join_strings(vars_to_delete, ','); const Graph& subgraph = paddle2cinn::CinnCompiler::GetInstance()->FindGraph( - cinn_launch_op->GetOp()->Attr(operators::kCompilationKey)); + cinn_launch_op->GetOp()->Attr(operators::kCompilationKey)); auto& dst_varinfo_map = subgraph.Get(paddle2cinn::kMemOptVarInfoFromMainGraph); const Name2VarInfoMap& src_varinfo_map = diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc index 60f4e4b309c5d..88bf9e3876399 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc @@ -51,8 +51,7 @@ static ProgramDesc BuildProgramInsideCinnLaunchOp() { return program; } -static ProgramDesc BuildProgramWithCinnLaunchOp( - const std::string& compilation_key) { +static ProgramDesc BuildProgramWithCinnLaunchOp(int64_t compilation_key) { // create a cinn_launch op ProgramDesc program; auto* block = program.MutableBlock(0); @@ -89,7 +88,7 @@ TEST(ShareMemInfoToSubGraphPassTest, test_main_graph_share_varinfo) { auto subgraph = std::make_unique(BuildProgramInsideCinnLaunchOp()); subgraph->GetOrInit( paddle2cinn::kMemOptVarInfoFromMainGraph); - std::string compilation_key = + auto compilation_key = paddle2cinn::CinnCompiler::GetInstance()->AddGraph(std::move(subgraph)); // build test data and apply pass diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000..50e751e02dfa0 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc @@ -0,0 +1,272 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +class Node; + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ( + eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), + platform::errors::InvalidArgument( + "Tensor elementwise y(%d) and activation bias(%d) must have same " + "dimension.", + eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); + auto* weights_data = weights->mutable_data(platform::CPUPlace()); + + EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; + + // Check for subnormal values that slows down convolution execution + for (int i = 0; i < weights->numel(); ++i) { + if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; + } +} + +ConvAffineChannelFusePass::ConvAffineChannelFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + 
.AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("affine_channel")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("data_layout") + .IsStringIn({"NCHW", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); +} + +void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(name_scope_, graph); + + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; + return; + } + + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + auto data_format = conv->Op()->GetAttrIfExists("data_format"); + if (data_format == "AnyLayout") { + LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, " + "it's wrong if data_format of conv is not " + "NCHW."; + } + + // Get affine_channel bias for resizing eltwise_y! + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + // Set shape && datatype manually + eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); + eltwise_y_in_desc.SetDataType( + framework::TransToProtoVarType(ac_bias_tensor->dtype())); + eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); + eltwise_y_in_desc.SetPersistable(true); + + // Initialize eltwise_y + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + scope->Var(eltwise_y_in_node->Name())->GetMutable(); + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. 
+ OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); + + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph, handler); + + AddStatis(found_conv_ac_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_mkldnn_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); + +REGISTER_PASS_CAPABILITY(conv_affine_channel_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("conv2d", 1) + .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000..075b6d7220316 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. + */ +class Graph; + +class ConvAffineChannelFusePass : public FusePassBase { + public: + ConvAffineChannelFusePass(); + virtual ~ConvAffineChannelFusePass() {} + + protected: + void ApplyImpl(ir::Graph*) const override; + const std::string name_scope_{"conv_affine_channel_mkldnn_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index 62b2be712beef..eebc87f5d9988 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -186,9 +186,22 @@ class DeQuantizer final : public Quanter { // Checking whether a reorder from BF16 to FP32 // should be added after the output to the operator bool IsNotPermittedName(const std::string& output_name) const override { - // XShape is output in transpose2 and reshape2 operators used to store the - // shape and lod of X. So this output do not need dequantize before. 
- return (output_name == "XShape"); + std::unordered_map> block_list{ + {"layer_norm", + {"Mean", "Variance"}}}; // not used in inference in MKLDNN + + std::vector blocked_outputs{"XShape"}; // blocklist for any op + auto op_name = op->Name(); + if (block_list.count(op_name)) { + const auto& op_blocklist = block_list[op_name]; + blocked_outputs.insert(blocked_outputs.begin(), op_blocklist.begin(), + op_blocklist.end()); + } + + return std::any_of(blocked_outputs.begin(), blocked_outputs.end(), + [&output_name](const std::string& name) { + return name == output_name; + }); } std::string get_op_type() const override { return "dequantize"; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc index 877ee71fc2d85..3f5e9a1484841 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc @@ -65,22 +65,20 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, static const std::initializer_list variable_names{ "z", "a", "b", "c", "d", "e", "f", "g", "h", "i"}; -void PreparePass(std::unique_ptr* graph, const ProgramDesc& prog, - const std::initializer_list variable_names, - int* original_nodes_num, int* current_nodes_num) { +void PreparePass(std::unique_ptr& graph, int* original_nodes_num, + int* current_nodes_num) { auto pass = PassRegistry::Instance().Get("cpu_bfloat16_pass"); - *original_nodes_num = (*graph)->Nodes().size(); - (*graph).reset(pass->Apply((*graph).release())); - *current_nodes_num = (*graph)->Nodes().size(); + *original_nodes_num = graph->Nodes().size(); + graph.reset(pass->Apply(graph.release())); + *current_nodes_num = graph->Nodes().size(); } void MainTest(const ProgramDesc& prog, const int& quant_count, const int& dequant_count, const int& added_nodes_count) { - std::unique_ptr graph(new ir::Graph(prog)); + auto graph = std::make_unique(prog); int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names, &original_nodes_num, - ¤t_nodes_num); + PreparePass(graph, &original_nodes_num, ¤t_nodes_num); int quantize_nodes_count = 0; int dequantize_nodes_count = 0; diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc new file mode 100644 index 0000000000000..678a8fb4a6955 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
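The dequantizer change in cpu_bfloat16_pass.cc above generalizes the old hard-coded "XShape" check into a per-operator block list. Here is a small stand-alone sketch of that logic, with our own function name and a plain op-type string instead of the pass's Node accessors; only the control flow mirrors the pass.

#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

// Returns true when no BF16->FP32 reorder (dequantize) should be added for
// this output: "XShape" is skipped for every op, and layer_norm additionally
// skips Mean/Variance, which are unused in MKLDNN inference.
bool IsBlockedOutput(const std::string& op_type,
                     const std::string& output_name) {
  static const std::unordered_map<std::string, std::vector<std::string>>
      block_list{{"layer_norm", {"Mean", "Variance"}}};

  std::vector<std::string> blocked_outputs{"XShape"};  // blocked for any op
  auto it = block_list.find(op_type);
  if (it != block_list.end()) {
    blocked_outputs.insert(blocked_outputs.end(), it->second.begin(),
                           it->second.end());
  }
  return std::any_of(blocked_outputs.begin(), blocked_outputs.end(),
                     [&output_name](const std::string& name) {
                       return name == output_name;
                     });
}
// IsBlockedOutput("layer_norm", "Mean")  -> true  (no dequantize inserted)
// IsBlockedOutput("conv2d", "Output")    -> false (reorder back to FP32)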
+ +#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" + +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +Int8ScaleCalculationMkldnnPass::Int8ScaleCalculationMkldnnPass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "AnyLayout"}) + .End(); +} + +void Int8ScaleCalculationMkldnnPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::InvalidArgument( + "Pointer to graph argument should not be NULL.")); + FusePassBase::Init("int8_scale_calculation_mkldnn_pass", graph); + GraphPatternDetector gpd; + patterns::Conv conv_pattern(gpd.mutable_pattern(), + "int8_scale_calculation_mkldnn_pass"); + conv_pattern(); + + int found_int8_scales_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + + if (!platform::HasOpINT8DataType(conv_op->Op()) || + conv_op->Op()->HasAttr("Sum_scale")) { + return; + } + + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + auto input_names = conv_op->Op()->InputNames(); + bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") != + input_names.end(); + std::vector weights_tz = conv_filter->Var()->GetShape(); + const int groups = + std::max(conv_op->Op()->GetAttrIfExists("groups"), 1); + + const auto& scale_weights_data = + conv_op->Op()->GetAttrIfExists>("Scale_weights"); + const auto& scale_in_data = + conv_op->Op()->GetAttrIfExists("Scale_in"); + + bool is_multi_channel = scale_weights_data.size() > 1; + + int count = 1; + if (is_multi_channel) { + count *= weights_tz[0]; + if (groups > 1) { + count *= weights_tz[1]; + } + } + + if (has_bias && conv_op->Op()->Input("Bias").size() > 0) { + auto bias_scales = std::vector(count); + for (int i = 0; i < count; i++) { + bias_scales[i] = scale_in_data * scale_weights_data[i]; + } + conv_op->Op()->SetAttr("Bias_scales", bias_scales); + } + + const bool& force_fp32_output = + conv_op->Op()->GetAttrIfExists("force_fp32_output"); + const bool& fuse_residual_conn = + conv_op->Op()->GetAttrIfExists("fuse_residual_connection"); + const auto& scale_in_eltwise_data = + conv_op->Op()->GetAttrIfExists("Scale_in_eltwise"); + bool has_activation = + !conv_op->Op()->GetAttrIfExists("fuse_activation").empty(); + float activation_scale = + force_fp32_output + ? 1.0f + : has_activation + ? conv_op->Op()->GetAttrIfExists("Scale_out") + : 1.0f; + auto scale_out_data = + force_fp32_output + ? 1.0f + : has_activation + ? 
1.0f + : conv_op->Op()->GetAttrIfExists("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + + std::vector output_shift_scale(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + // weights data will contain 0 in some models, then weights + // scale couldn't be calculated + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + + conv_op->Op()->SetAttr("Sum_scale", sum_scale); + conv_op->Op()->SetAttr("Output_shift_scale", output_shift_scale); + conv_op->Op()->SetAttr("Activation_scale", activation_scale); + found_int8_scales_count++; + }; + gpd(graph, handler); + AddStatis(found_int8_scales_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(int8_scale_calculation_mkldnn_pass, + paddle::framework::ir::Int8ScaleCalculationMkldnnPass); +REGISTER_PASS_CAPABILITY(int8_scale_calculation_mkldnn_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().LE( + "conv2d", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h new file mode 100644 index 0000000000000..9233650a2db3c --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// #include +// #include +// #include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; +/* + * compute quantization scales for biases and weights + */ +class Int8ScaleCalculationMkldnnPass : public FusePassBase { + public: + Int8ScaleCalculationMkldnnPass(); + virtual ~Int8ScaleCalculationMkldnnPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc new file mode 100644 index 0000000000000..804d04e35f690 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, + std::vector scale_weights = {1.5f}) { + auto* op = prog->MutableBlock(0)->AppendOp(); + + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetAttr("name", name); + op->SetAttr("strides", std::vector({1, 1})); + op->SetAttr("groups", 1); + op->SetAttr("paddings", std::vector({0, 0})); + op->SetAttr("padding_algorithm", std::string("EXPLICIT")); + op->SetAttr("dilations", std::vector({1, 1})); + op->SetAttr("data_format", std::string("NCHW")); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) + op->SetInput("Bias", {inputs[2]}); + else + op->SetInput("Bias", {}); + + op->SetOutput("Output", outputs); + op->SetAttr("Scale_in", 1.0f); + op->SetAttr("Scale_out", 1.0f); + op->SetAttr("Scale_weights", scale_weights); + op->SetAttr("use_mkldnn", true); + op->SetAttr("mkldnn_data_type", std::string("int8")); + } else { + FAIL() << "Unexpected operator type."; + } +} + +ProgramDesc BuildProgramDesc(bool convWithExistingBias, + std::vector scale_weights = {1.5}) { + ProgramDesc prog; + std::vector nodes{"c", "weights", "f"}; + if (convWithExistingBias) nodes.push_back("conv_bias"); + for (auto& v : nodes) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::LOD_TENSOR); + if (v == "weights") { + var->SetPersistable(true); + var->SetShape({1, static_cast(scale_weights.size()), 1, 1}); + } + } + + if (convWithExistingBias) { + SetOp(&prog, "conv2d", "conv", + std::vector({"c", "weights", "conv_bias"}), + std::vector({"f"}), scale_weights); + } else if (scale_weights.size() > 1) { + SetOp(&prog, "conv2d", "conv", + std::vector({"c", "weights", "conv_bias"}), + std::vector({"f"}), scale_weights); + } else { + SetOp(&prog, "conv2d", "conv", std::vector({"c", "weights"}), + std::vector({"f"})); + } + + return prog; +} + +void MainTest(bool convWithExistingBias, int removed_nodes_count, float scale, + std::vector scale_weights = {1.5f}) { + auto prog = BuildProgramDesc(convWithExistingBias, scale_weights); + std::unique_ptr graph(new ir::Graph(prog)); + auto pass = + PassRegistry::Instance().Get("int8_scale_calculation_mkldnn_pass"); + int original_nodes_num = graph->Nodes().size(); + graph.reset(pass->Apply(graph.release())); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_EQ(original_nodes_num, current_nodes_num); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + + EXPECT_EQ(op->GetAttrIfExists>("Scale_weights"), + scale_weights); + EXPECT_EQ(op->GetAttrIfExists("Scale_in"), scale); + EXPECT_EQ(op->GetAttrIfExists("Scale_out"), scale); + + EXPECT_EQ(op->GetAttrIfExists("Sum_scale"), scale); + EXPECT_EQ( + op->GetAttrIfExists>("Output_shift_scale")[0], + scale 
/ scale_weights[0]); + EXPECT_EQ(op->GetAttrIfExists("Activation_scale"), scale); + + if (convWithExistingBias) { + EXPECT_EQ(op->GetAttrIfExists>("Bias_scales")[0], + scale * scale_weights[0]); + } + } + } + EXPECT_EQ(original_nodes_num - removed_nodes_count, current_nodes_num); +} + +TEST(Int8ScaleCalculationMkldnnPass, int8_scale_calculation_with_no_bias) { + auto scale = 1.0f; + int removed_nodes_count = 0; + auto scale_weights = {1.5f}; + MainTest(false, removed_nodes_count, scale, scale_weights); +} + +TEST(Int8ScaleCalculationMkldnnPass, int8_scale_calculation_with_bias) { + auto scale = 1.0f; + int removed_nodes_count = 0; + auto scale_weights = {1.5f}; + MainTest(true, removed_nodes_count, scale, scale_weights); +} + +TEST(Int8ScaleCalculationMkldnnPass, + int8_scale_calculation_with_bias_scale_weights) { + auto scale = 1.0f; + int removed_nodes_count = 0; + std::vector scale_weights = {1.5f, 2.3f}; + MainTest(true, removed_nodes_count, scale, scale_weights); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(int8_scale_calculation_mkldnn_pass); diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc new file mode 100644 index 0000000000000..67dfe074dc075 --- /dev/null +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc @@ -0,0 +1,298 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { +void SkipLayernorm::operator()() { + // Create nodes for skip_layernorm. + auto* skip_layernorm_x = pattern->NewNode(skip_layernorm_x_repr()) + ->assert_is_op_input("skip_layernorm", "X"); + auto* skip_layernorm_y = pattern->NewNode(skip_layernorm_y_repr()) + ->assert_is_op_input("skip_layernorm", "Y"); + auto* skip_layernorm_op = pattern->NewNode(skip_layernorm_op_repr()) + ->assert_is_op("skip_layernorm"); + auto* skip_layernorm_out = pattern->NewNode(skip_layernorm_out_repr()) + ->assert_is_op_output("skip_layernorm", "Out"); + + // Add links for skip_layernorm op. + skip_layernorm_op->LinksFrom({skip_layernorm_x, skip_layernorm_y}) + .LinksTo({skip_layernorm_out}); +} + +void MultiheadMatmul::operator()() { + // Create nodes for multihead_matmul. + auto* multihead_matmul_input = + pattern->NewNode(multihead_matmul_input_repr()) + ->assert_is_op_input("multihead_matmul", "Input"); + auto* multihead_matmul_op = pattern->NewNode(multihead_matmul_op_repr()) + ->assert_is_op("multihead_matmul"); + auto* multihead_matmul_out = + pattern->NewNode(multihead_matmul_out_repr()) + ->assert_is_op_output("multihead_matmul", "Out"); + + // Add links for multihead_matmul op. 
+ multihead_matmul_op->LinksFrom({multihead_matmul_input}) + .LinksTo({multihead_matmul_out}); +} + +void Fc::operator()() { + // Create nodes for fc. + auto* fc_input = + pattern->NewNode(fc_input_repr())->assert_is_op_input("fc", "Input"); + auto* fc_op = pattern->NewNode(fc_op_repr())->assert_is_op("fc"); + auto* fc_out = + pattern->NewNode(fc_out_repr())->assert_is_op_output("fc", "Out"); + + // Add links for fc op. + fc_op->LinksFrom({fc_input}).LinksTo({fc_out}); +} + +void Activation::operator()() { + // Create nodes for activation. + std::unordered_set activation_ops{"relu", "sigmoid", "tanh"}; + auto* activation_input = pattern->NewNode(activation_input_repr()) + ->assert_is_ops_input(activation_ops); + auto* activation_op = + pattern->NewNode(activation_op_repr())->assert_is_ops(activation_ops); + auto* activation_out = pattern->NewNode(activation_out_repr()) + ->assert_is_ops_output(activation_ops); + + // Add links for activation op. + activation_op->LinksFrom({activation_input}).LinksTo({activation_out}); +} +} // namespace patterns + +void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + int found_subgraph_count = 0; + + // Create an remove_padding op node + auto insert_remove_padding_op = [&](Node* input_node, Node* op_node) { + // create op, var in graph + OpDesc remove_padding; + std::string remove_padding_out_name = + input_node->Name() + ".remove_padding"; + + VarDesc remove_padding_out(remove_padding_out_name); + remove_padding_out.SetDataType(input_node->Var()->GetDataType()); + remove_padding_out.SetShape(input_node->Var()->GetShape()); + remove_padding_out.SetPersistable(false); + + // remove_padding_op + remove_padding.SetType("remove_padding"); + + // input + remove_padding.SetInput("Input", {input_node->Name()}); + + // output + remove_padding.SetOutput("Out", {remove_padding_out_name}); + + auto remove_padding_op_node = graph->CreateOpNode(&remove_padding); + auto remove_padding_out_node = graph->CreateVarNode(&remove_padding_out); + + // replace link + for (size_t i = 0; i < input_node->outputs.size(); ++i) { + if (input_node->outputs[i] == op_node) { + input_node->outputs[i] = remove_padding_op_node; + remove_padding_op_node->inputs.push_back(input_node); + } + } + + // link node + IR_NODE_LINK_TO(remove_padding_op_node, remove_padding_out_node); + + // replace link + for (size_t i = 0; i < op_node->inputs.size(); ++i) { + if (op_node->inputs[i] == input_node) { + op_node->inputs[i] = remove_padding_out_node; + remove_padding_out_node->outputs.push_back(op_node); + } + } + + // create variable in scope + scope->Var(remove_padding_out_name); + auto* remove_padding_out_tensor = + scope->FindVar(remove_padding_out_name)->GetMutable(); + remove_padding_out_tensor->mutable_data(platform::CUDAPlace()); + + // rename + op_node->Op()->RenameInput(input_node->Name(), + remove_padding_out_node->Name()); + }; + + // create an remove_padding op node + auto insert_recover_padding_op = [&](Node* op_node, Node* out_node) { + // create op, var in graph + OpDesc recover_padding; + std::string recover_padding_input_name = + out_node->Name() + ".recover_padding"; + VarDesc recover_padding_input(recover_padding_input_name); + recover_padding_input.SetDataType(out_node->Var()->GetDataType()); + recover_padding_input.SetShape(out_node->Var()->GetShape()); + 
recover_padding_input.SetPersistable(false); + + // recover_padding_op + recover_padding.SetType("recover_padding"); + + // input + recover_padding.SetInput("Input", {recover_padding_input_name}); + + // output + recover_padding.SetOutput("Out", {out_node->Name()}); + + auto recover_padding_op_node = graph->CreateOpNode(&recover_padding); + auto recover_padding_input_node = + graph->CreateVarNode(&recover_padding_input); + + // replace link + for (size_t i = 0; i < op_node->outputs.size(); ++i) { + if (op_node->outputs[i] == out_node) { + op_node->outputs[i] = recover_padding_input_node; + recover_padding_input_node->inputs.push_back(op_node); + } + } + + // link node + IR_NODE_LINK_TO(recover_padding_input_node, recover_padding_op_node); + + // replace link + for (size_t i = 0; i < out_node->inputs.size(); ++i) { + if (out_node->inputs[i] == op_node) { + out_node->inputs[i] = recover_padding_op_node; + recover_padding_op_node->outputs.push_back(out_node); + } + } + + // create variable in scope + scope->Var(recover_padding_input_name); + auto* recover_padding_input_tensor = + scope->FindVar(recover_padding_input_name)->GetMutable(); + recover_padding_input_tensor->mutable_data(platform::CUDAPlace()); + + // rename + op_node->Op()->RenameOutput(out_node->Name(), recover_padding_input_name); + }; + + GraphPatternDetector gpd1; + patterns::SkipLayernorm skip_layernorm(gpd1.mutable_pattern(), + "remove_padding_recover_padding_pass"); + skip_layernorm(); + + auto handler1 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: " + "skip_layernorm"; + + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_x, skip_layernorm_x, + skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_y, skip_layernorm_y, + skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_op, skip_layernorm_op, + skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_out, skip_layernorm_out, + skip_layernorm); + + insert_remove_padding_op(skip_layernorm_x, skip_layernorm_op); + insert_remove_padding_op(skip_layernorm_y, skip_layernorm_op); + insert_recover_padding_op(skip_layernorm_op, skip_layernorm_out); + + found_subgraph_count++; + }; + gpd1(graph, handler1); + + GraphPatternDetector gpd2; + patterns::MultiheadMatmul multihead_matmul( + gpd2.mutable_pattern(), "remove_padding_recover_padding_pass"); + multihead_matmul(); + + auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: " + "multihead_matmul"; + + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_input, multihead_matmul_input, + multihead_matmul); + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_op, multihead_matmul_op, + multihead_matmul); + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_out, multihead_matmul_out, + multihead_matmul); + + insert_remove_padding_op(multihead_matmul_input, multihead_matmul_op); + insert_recover_padding_op(multihead_matmul_op, multihead_matmul_out); + + found_subgraph_count++; + }; + gpd2(graph, handler2); + + GraphPatternDetector gpd3; + patterns::Fc fc(gpd3.mutable_pattern(), + "remove_padding_recover_padding_pass"); + fc(); + + auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: fc"; + + GET_IR_NODE_FROM_SUBGRAPH(fc_input, fc_input, fc); + GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc_op, fc); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, fc); + + 
insert_remove_padding_op(fc_input, fc_op); + insert_recover_padding_op(fc_op, fc_out); + + found_subgraph_count++; + }; + gpd3(graph, handler3); + + GraphPatternDetector gpd4; + patterns::Activation activation(gpd4.mutable_pattern(), + "remove_padding_recover_padding_pass"); + activation(); + + auto handler4 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) + << "remove_padding_recover_padding_pass for transformer: activation"; + + GET_IR_NODE_FROM_SUBGRAPH(activation_input, activation_input, activation); + GET_IR_NODE_FROM_SUBGRAPH(activation_op, activation_op, activation); + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, activation); + + insert_remove_padding_op(activation_input, activation_op); + insert_recover_padding_op(activation_op, activation_out); + + found_subgraph_count++; + }; + gpd4(graph, handler4); + + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(remove_padding_recover_padding_pass, + paddle::framework::ir::RemovePaddingRecoverPaddingPass); diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h new file mode 100644 index 0000000000000..d7ccfc75c2000 --- /dev/null +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h @@ -0,0 +1,94 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
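The pass above only rewires the graph around skip_layernorm, multihead_matmul, fc and activation ops; the remove_padding and recover_padding ops themselves are implemented elsewhere. As a purely conceptual, host-side sketch of the data movement those ops stand for (our own simplification, not the device kernels the ops actually use): remove_padding packs a padded [batch, max_len, hidden] buffer into [total_tokens, hidden] using per-sequence lengths, and recover_padding scatters it back.

#include <algorithm>
#include <vector>

std::vector<float> RemovePadding(const std::vector<float>& padded,
                                 const std::vector<int>& seq_lens,
                                 int max_len, int hidden) {
  std::vector<float> packed;
  for (std::size_t b = 0; b < seq_lens.size(); ++b) {
    const float* seq = padded.data() + b * max_len * hidden;
    packed.insert(packed.end(), seq, seq + seq_lens[b] * hidden);
  }
  return packed;  // size == sum(seq_lens) * hidden
}

std::vector<float> RecoverPadding(const std::vector<float>& packed,
                                  const std::vector<int>& seq_lens,
                                  int max_len, int hidden) {
  std::vector<float> padded(seq_lens.size() * max_len * hidden, 0.f);
  const float* src = packed.data();
  for (std::size_t b = 0; b < seq_lens.size(); ++b) {
    std::copy(src, src + seq_lens[b] * hidden,
              padded.begin() + b * max_len * hidden);
    src += seq_lens[b] * hidden;
  }
  return padded;
}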
+ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct SkipLayernorm : public PatternBase { + SkipLayernorm(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "skip_layernorm") {} + + void operator()(); + + PATTERN_DECL_NODE(skip_layernorm_x); + PATTERN_DECL_NODE(skip_layernorm_y); + PATTERN_DECL_NODE(skip_layernorm_op); + PATTERN_DECL_NODE(skip_layernorm_out); +}; + +struct MultiheadMatmul : public PatternBase { + MultiheadMatmul(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul") {} + + void operator()(); + + PATTERN_DECL_NODE(multihead_matmul_input); + PATTERN_DECL_NODE(multihead_matmul_op); + PATTERN_DECL_NODE(multihead_matmul_out); +}; + +struct Fc : public PatternBase { + Fc(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "fc") {} + + void operator()(); + + PATTERN_DECL_NODE(fc_input); + PATTERN_DECL_NODE(fc_op); + PATTERN_DECL_NODE(fc_out); +}; + +struct Activation : public PatternBase { + Activation(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "activation") {} + + void operator()(); + + PATTERN_DECL_NODE(activation_input); + PATTERN_DECL_NODE(activation_op); + PATTERN_DECL_NODE(activation_out); +}; +} // namespace patterns + +class RemovePaddingRecoverPaddingPass : public FusePassBase { + public: + RemovePaddingRecoverPaddingPass() {} + virtual ~RemovePaddingRecoverPaddingPass() {} + + protected: + void ApplyImpl(Graph *graph) const; + const std::string name_scope_{"remove_padding_recover_padding_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc new file mode 100644 index 0000000000000..37e77bc134d3c --- /dev/null +++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc @@ -0,0 +1,161 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
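The set_transformer_input_convert_pass whose implementation begins here emits two auxiliary outputs, pos_id_tensor and max_seqlen_tensor (see the handler further below). Our reading, not stated in this patch, is that these feed TensorRT's variable-length transformer path, which typically wants cumulative sequence offsets and the batch's maximum sequence length. The sketch below shows that kind of computation on the host for a batch of sequence lengths; all names are illustrative and the real op works on device tensors.

#include <algorithm>
#include <cstdint>
#include <vector>

struct TransformerMeta {
  std::vector<int32_t> pos_id;  // prefix-sum offsets, size = batch + 1
  int32_t max_seqlen = 0;
};

TransformerMeta BuildMeta(const std::vector<int32_t>& seq_lens) {
  TransformerMeta meta;
  meta.pos_id.push_back(0);
  for (int32_t len : seq_lens) {
    meta.pos_id.push_back(meta.pos_id.back() + len);
    meta.max_seqlen = std::max(meta.max_seqlen, len);
  }
  return meta;
}
// BuildMeta({3, 5, 2}) -> pos_id = {0, 3, 8, 10}, max_seqlen = 5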
+ +#include "paddle/fluid/framework/ir/set_transformer_input_convert_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +SetTransformerInputConvertPass::SetTransformerInputConvertPass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .End(); +} +namespace patterns { + +void SetTransformerInputConvert::operator()() { + std::unordered_set lookup_table_ops{"lookup_table", + "lookup_table_v2"}; + // Create nodes for lookup_table1 op. + auto *lookup_table1_x = pattern->NewNode(lookup_table1_x_repr()) + ->assert_is_ops_input(lookup_table_ops, "Ids"); + auto *lookup_table1_w = pattern->NewNode(lookup_table1_w_repr()) + ->assert_is_ops_input(lookup_table_ops, "W"); + auto *lookup_table1_op = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(lookup_table_ops); + auto *lookup_table1_out = pattern->NewNode(lookup_table1_out_repr()) + ->assert_is_ops_output(lookup_table_ops) + ->AsIntermediate() + ->assert_is_op_input("elementwise_add", "X"); + + // Create nodes for lookup_table2 op. + auto *lookup_table2_x = pattern->NewNode(lookup_table2_x_repr()) + ->assert_is_ops_input(lookup_table_ops, "Ids"); + auto *lookup_table2_w = pattern->NewNode(lookup_table2_w_repr()) + ->assert_is_ops_input(lookup_table_ops, "W"); + auto *lookup_table2_op = + pattern->NewNode(lookup_table2_repr())->assert_is_ops(lookup_table_ops); + auto *lookup_table2_out = pattern->NewNode(lookup_table2_out_repr()) + ->assert_is_ops_output(lookup_table_ops) + ->AsIntermediate() + ->assert_is_op_input("elementwise_add", "Y"); + + // Create nodes for elementwise_add op. + auto *elementwise_op = + pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); + auto *elementwise_out = pattern->NewNode(elementwise_out_repr()) + ->AsOutput() + ->assert_is_only_output_of_op("elementwise_add"); + + // links nodes. 
+ lookup_table1_op->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + lookup_table2_op->LinksFrom({lookup_table2_x, lookup_table2_w}) + .LinksTo({lookup_table2_out}); + elementwise_op->LinksFrom({lookup_table1_out, lookup_table2_out}) + .LinksTo({elementwise_out}); +} + +} // namespace patterns + +void SetTransformerInputConvertPass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init(name_scope_, graph); + int found_subgraph_count = 0; + + GraphPatternDetector gpd; + patterns::SetTransformerInputConvert fused_pattern( + gpd.mutable_pattern(), "transformer_input_convert_pass"); + fused_pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "transformer_input_convert_pass in op compat failed."; + return; + } + + VLOG(3) << "transformer_input_convert_pass for pos_id, max_seqlen"; + + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, fused_pattern); + + // create op, var in graph + OpDesc new_desc; + new_desc.SetType("transformer_input_convert"); + + // inputs + new_desc.SetInput("X", {lookup_table2_x->Name()}); + + // outputs + std::vector output_0 = {"pos_id_tensor"}; + std::vector output_1 = {"max_seqlen_tensor"}; + new_desc.SetOutput("PosId", output_0); + new_desc.SetOutput("MaxSeqlen", output_1); + + std::string transformer_input_convert_out0_name = "pos_id_tensor"; + std::string transformer_input_convert_out1_name = "max_seqlen_tensor"; + VarDesc transformer_input_convert_out0(transformer_input_convert_out0_name); + VarDesc transformer_input_convert_out1(transformer_input_convert_out1_name); + transformer_input_convert_out0.SetDataType(proto::VarType::INT32); + transformer_input_convert_out1.SetDataType(proto::VarType::INT32); + transformer_input_convert_out0.SetShape({-1}); + transformer_input_convert_out1.SetShape({-1}); + transformer_input_convert_out0.SetPersistable(false); + transformer_input_convert_out1.SetPersistable(false); + + auto new_op_node = graph->CreateOpNode(&new_desc); + auto transformer_input_convert_out0_node = + graph->CreateVarNode(&transformer_input_convert_out0); + auto transformer_input_convert_out1_node = + graph->CreateVarNode(&transformer_input_convert_out1); + + // needn't create variable in scope + + IR_NODE_LINK_TO(lookup_table2_x, new_op_node); + IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out0_node); + IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out1_node); + + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(set_transformer_input_convert_pass, + paddle::framework::ir::SetTransformerInputConvertPass); +REGISTER_PASS_CAPABILITY(set_transformer_input_convert_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("lookup_table", 1) + .LE("lookup_table_v2", 1) + .LE("elementweise_add", 1)); diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h new file mode 100644 index 0000000000000..5a5843e810f9a --- /dev/null +++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +// in_var emb in_var emb +// | | | | +// lookup_table lookup_table +// | | +// lkt_var lkt_var +// \ / +// elementwise_add +// | +// elt_out_var +// +struct SetTransformerInputConvert : public PatternBase { + SetTransformerInputConvert(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "transformer_input_convert") {} + + void operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table2); + PATTERN_DECL_NODE(elementwise); + + // declare variable node's name + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(lookup_table2_x); + PATTERN_DECL_NODE(lookup_table2_w); + PATTERN_DECL_NODE(lookup_table2_out); + PATTERN_DECL_NODE(elementwise_out); +}; +} // namespace patterns + +class SetTransformerInputConvertPass : public FusePassBase { + public: + SetTransformerInputConvertPass(); + virtual ~SetTransformerInputConvertPass() {} + + protected: + void ApplyImpl(Graph *graph) const; + const std::string name_scope_{"transformer_input_convert_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc new file mode 100644 index 0000000000000..c974d334a8de0 --- /dev/null +++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc @@ -0,0 +1,302 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/yolo_box_fuse_pass.h" +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Node; + +namespace patterns { +struct YoloBoxPattern : public PatternBase { + YoloBoxPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + // elementwise_div pattern + auto* elt_div_in_x = pattern->NewNode(elt_div_in_x_repr()) + ->assert_is_op_input("elementwise_div", "X"); + auto* elt_div_in_y = pattern->NewNode(elt_div_in_y_repr()) + ->assert_is_op_input("elementwise_div", "Y"); + auto* elt_div = + pattern->NewNode(elt_div_repr())->assert_is_op("elementwise_div"); + auto* elt_div_out = pattern->NewNode(elt_div_out_repr()) + ->assert_is_op_output("elementwise_div", "Out") + ->assert_is_op_input("cast", "X"); + elt_div->LinksFrom({elt_div_in_x, elt_div_in_y}).LinksTo({elt_div_out}); + // cast pattern + auto* cast = pattern->NewNode(cast_repr())->assert_is_op("cast"); + auto* cast_out = pattern->NewNode(cast_out_repr()) + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("yolo_box", "ImgSize"); + cast->LinksFrom({elt_div_out}).LinksTo({cast_out}); +// 3 * (yolo_box + transpose) pattern +#define YOLO_BOX_TRANSPOSE_PATTERN(idx_) \ + auto* yolo_box##idx_##_in_x = pattern->NewNode(yolo_box##idx_##_in_x_repr()) \ + ->assert_is_op_input("yolo_box", "X"); \ + auto* yolo_box##idx_ = \ + pattern->NewNode(yolo_box##idx_##_repr())->assert_is_op("yolo_box"); \ + auto* yolo_box##idx_##_out_boxes = \ + pattern->NewNode(yolo_box##idx_##_out_boxes_repr()) \ + ->assert_is_op_output("yolo_box", "Boxes") \ + ->assert_is_op_nth_input("concat", "X", idx_); \ + auto* yolo_box##idx_##_out_scores = \ + pattern->NewNode(yolo_box##idx_##_out_scores_repr()) \ + ->assert_is_op_output("yolo_box", "Scores") \ + ->assert_is_op_input("transpose2", "X"); \ + yolo_box##idx_->LinksFrom({yolo_box##idx_##_in_x, cast_out}) \ + .LinksTo({yolo_box##idx_##_out_boxes, yolo_box##idx_##_out_scores}); \ + auto* transpose##idx_ = \ + pattern->NewNode(transpose##idx_##_repr())->assert_is_op("transpose2"); \ + auto* transpose##idx_##_out = \ + pattern->NewNode(transpose##idx_##_out_repr()) \ + ->assert_is_op_output("transpose2", "Out") \ + ->assert_is_op_nth_input("concat", "X", idx_); \ + auto* transpose##idx_##_out_xshape = \ + pattern->NewNode(transpose##idx_##_out_xshape_repr()) \ + ->assert_is_op_output("transpose2", "XShape"); \ + transpose##idx_->LinksFrom({yolo_box##idx_##_out_scores}) \ + .LinksTo({transpose##idx_##_out, transpose##idx_##_out_xshape}); + YOLO_BOX_TRANSPOSE_PATTERN(0); + YOLO_BOX_TRANSPOSE_PATTERN(1); + YOLO_BOX_TRANSPOSE_PATTERN(2); +#undef YOLO_BOX_TRANSPOSE_PATTERN + // concat0 pattern + auto* concat0 = pattern->NewNode(concat0_repr())->assert_is_op("concat"); + auto* concat0_out = pattern->NewNode(concat0_out_repr()) + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("multiclass_nms3", "BBoxes"); + concat0 + ->LinksFrom( + {yolo_box0_out_boxes, yolo_box1_out_boxes, yolo_box2_out_boxes}) + .LinksTo({concat0_out}); + // concat1 pattern + auto* concat1 = pattern->NewNode(concat1_repr())->assert_is_op("concat"); + auto* concat1_out = pattern->NewNode(concat1_out_repr()) + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("multiclass_nms3", 
"Scores"); + concat1->LinksFrom({transpose0_out, transpose1_out, transpose2_out}) + .LinksTo({concat1_out}); + // nms pattern + auto* nms = pattern->NewNode(nms_repr())->assert_is_op("multiclass_nms3"); + auto* nms_out = pattern->NewNode(nms_out_repr()) + ->assert_is_op_output("multiclass_nms3", "Out"); + auto* nms_out_index = pattern->NewNode(nms_out_index_repr()) + ->assert_is_op_output("multiclass_nms3", "Index"); + auto* nms_out_rois_num = + pattern->NewNode(nms_out_rois_num_repr()) + ->assert_is_op_output("multiclass_nms3", "NmsRoisNum"); + nms->LinksFrom({concat0_out, concat1_out}) + .LinksTo({nms_out, nms_out_index, nms_out_rois_num}); + } + + // declare operator node's name + PATTERN_DECL_NODE(elt_div); + PATTERN_DECL_NODE(cast); + PATTERN_DECL_NODE(yolo_box0); + PATTERN_DECL_NODE(yolo_box1); + PATTERN_DECL_NODE(yolo_box2); + PATTERN_DECL_NODE(concat0); + PATTERN_DECL_NODE(transpose0); + PATTERN_DECL_NODE(transpose1); + PATTERN_DECL_NODE(transpose2); + PATTERN_DECL_NODE(concat1); + PATTERN_DECL_NODE(nms); + // declare variable node's name + PATTERN_DECL_NODE(elt_div_in_x); + PATTERN_DECL_NODE(elt_div_in_y); + PATTERN_DECL_NODE(elt_div_out); + PATTERN_DECL_NODE(cast_out); + PATTERN_DECL_NODE(yolo_box0_in_x); + PATTERN_DECL_NODE(yolo_box1_in_x); + PATTERN_DECL_NODE(yolo_box2_in_x); + PATTERN_DECL_NODE(yolo_box0_out_boxes); + PATTERN_DECL_NODE(yolo_box1_out_boxes); + PATTERN_DECL_NODE(yolo_box2_out_boxes); + PATTERN_DECL_NODE(yolo_box0_out_scores); + PATTERN_DECL_NODE(yolo_box1_out_scores); + PATTERN_DECL_NODE(yolo_box2_out_scores); + PATTERN_DECL_NODE(concat0_out); + PATTERN_DECL_NODE(transpose0_out); + PATTERN_DECL_NODE(transpose1_out); + PATTERN_DECL_NODE(transpose2_out); + PATTERN_DECL_NODE(transpose0_out_xshape); + PATTERN_DECL_NODE(transpose1_out_xshape); + PATTERN_DECL_NODE(transpose2_out_xshape); + PATTERN_DECL_NODE(concat1_out); + PATTERN_DECL_NODE(nms_out); + PATTERN_DECL_NODE(nms_out_index); + PATTERN_DECL_NODE(nms_out_rois_num); +}; +} // namespace patterns + +YoloBoxFusePass::YoloBoxFusePass() {} + +void YoloBoxFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + GraphPatternDetector gpd; + patterns::YoloBoxPattern yolo_box_pattern(gpd.mutable_pattern(), name_scope_); + int found_subgraph_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle YoloBoxFusePass fuse"; +#define GET_IR_NODE(node_) \ + GET_IR_NODE_FROM_SUBGRAPH(node_, node_, yolo_box_pattern) + GET_IR_NODE(elt_div); + GET_IR_NODE(cast); + GET_IR_NODE(yolo_box0); + GET_IR_NODE(yolo_box1); + GET_IR_NODE(yolo_box2); + GET_IR_NODE(concat0); + GET_IR_NODE(transpose0); + GET_IR_NODE(transpose1); + GET_IR_NODE(transpose2); + GET_IR_NODE(concat1); + GET_IR_NODE(nms); + GET_IR_NODE(elt_div_in_x); + GET_IR_NODE(elt_div_in_y); + GET_IR_NODE(elt_div_out); + GET_IR_NODE(cast_out); + GET_IR_NODE(yolo_box0_in_x); + GET_IR_NODE(yolo_box1_in_x); + GET_IR_NODE(yolo_box2_in_x); + GET_IR_NODE(yolo_box0_out_boxes); + GET_IR_NODE(yolo_box1_out_boxes); + GET_IR_NODE(yolo_box2_out_boxes); + GET_IR_NODE(yolo_box0_out_scores); + GET_IR_NODE(yolo_box1_out_scores); + GET_IR_NODE(yolo_box2_out_scores); + GET_IR_NODE(concat0_out); + GET_IR_NODE(transpose0_out); + GET_IR_NODE(transpose1_out); + GET_IR_NODE(transpose2_out); + GET_IR_NODE(transpose0_out_xshape); + GET_IR_NODE(transpose1_out_xshape); + GET_IR_NODE(transpose2_out_xshape); + 
GET_IR_NODE(concat1_out); + GET_IR_NODE(nms_out); + GET_IR_NODE(nms_out_index); + GET_IR_NODE(nms_out_rois_num); +#undef GET_IR_NODE + +// create yolo_box_head +#define CREATE_YOLO_BOX_HEAD(idx_) \ + framework::OpDesc yolo_box_head##idx_##_op_desc; \ + yolo_box_head##idx_##_op_desc.SetType("yolo_box_head"); \ + yolo_box_head##idx_##_op_desc.SetInput("X", \ + {yolo_box##idx_##_in_x->Name()}); \ + yolo_box_head##idx_##_op_desc.SetAttr( \ + "anchors", yolo_box##idx_->Op()->GetAttr("anchors")); \ + yolo_box_head##idx_##_op_desc.SetAttr( \ + "class_num", yolo_box##idx_->Op()->GetAttr("class_num")); \ + yolo_box_head##idx_##_op_desc.SetOutput( \ + "Out", {yolo_box##idx_##_out_boxes->Name()}); \ + yolo_box_head##idx_##_op_desc.Flush(); \ + auto* yolo_box_head##idx_ = \ + graph->CreateOpNode(&yolo_box_head##idx_##_op_desc); \ + IR_NODE_LINK_TO(yolo_box##idx_##_in_x, yolo_box_head##idx_); \ + IR_NODE_LINK_TO(yolo_box_head##idx_, yolo_box##idx_##_out_boxes); + CREATE_YOLO_BOX_HEAD(0); + CREATE_YOLO_BOX_HEAD(1); + CREATE_YOLO_BOX_HEAD(2); +#undef CREATE_YOLO_BOX_HEAD + + // create yolo_box_post + framework::OpDesc yolo_box_post_op_desc; + yolo_box_post_op_desc.SetType("yolo_box_post"); + yolo_box_post_op_desc.SetInput("Boxes0", {yolo_box0_out_boxes->Name()}); + yolo_box_post_op_desc.SetInput("Boxes1", {yolo_box1_out_boxes->Name()}); + yolo_box_post_op_desc.SetInput("Boxes2", {yolo_box2_out_boxes->Name()}); + yolo_box_post_op_desc.SetInput("ImageShape", {elt_div_in_x->Name()}); + yolo_box_post_op_desc.SetInput("ImageScale", {elt_div_in_y->Name()}); + yolo_box_post_op_desc.SetAttr("anchors0", + yolo_box0->Op()->GetAttr("anchors")); + yolo_box_post_op_desc.SetAttr("anchors1", + yolo_box1->Op()->GetAttr("anchors")); + yolo_box_post_op_desc.SetAttr("anchors2", + yolo_box2->Op()->GetAttr("anchors")); + yolo_box_post_op_desc.SetAttr("class_num", + yolo_box0->Op()->GetAttr("class_num")); + yolo_box_post_op_desc.SetAttr("conf_thresh", + yolo_box0->Op()->GetAttr("conf_thresh")); + yolo_box_post_op_desc.SetAttr("downsample_ratio0", + yolo_box0->Op()->GetAttr("downsample_ratio")); + yolo_box_post_op_desc.SetAttr("downsample_ratio1", + yolo_box1->Op()->GetAttr("downsample_ratio")); + yolo_box_post_op_desc.SetAttr("downsample_ratio2", + yolo_box2->Op()->GetAttr("downsample_ratio")); + yolo_box_post_op_desc.SetAttr("clip_bbox", + yolo_box0->Op()->GetAttr("clip_bbox")); + yolo_box_post_op_desc.SetAttr("scale_x_y", + yolo_box0->Op()->GetAttr("scale_x_y")); + yolo_box_post_op_desc.SetAttr("nms_threshold", + nms->Op()->GetAttr("nms_threshold")); + yolo_box_post_op_desc.SetOutput("Out", {nms_out->Name()}); + yolo_box_post_op_desc.SetOutput("NmsRoisNum", {nms_out_rois_num->Name()}); + auto* yolo_box_post = graph->CreateOpNode(&yolo_box_post_op_desc); + IR_NODE_LINK_TO(yolo_box0_out_boxes, yolo_box_post); + IR_NODE_LINK_TO(yolo_box1_out_boxes, yolo_box_post); + IR_NODE_LINK_TO(yolo_box2_out_boxes, yolo_box_post); + IR_NODE_LINK_TO(elt_div_in_x, yolo_box_post); + IR_NODE_LINK_TO(elt_div_in_y, yolo_box_post); + IR_NODE_LINK_TO(yolo_box_post, nms_out); + IR_NODE_LINK_TO(yolo_box_post, nms_out_rois_num); + + // delete useless node + GraphSafeRemoveNodes(graph, {elt_div, + cast, + yolo_box0, + yolo_box1, + yolo_box2, + concat0, + transpose0, + transpose1, + transpose2, + concat1, + nms, + elt_div_out, + cast_out, + yolo_box0_out_scores, + yolo_box1_out_scores, + yolo_box2_out_scores, + concat0_out, + transpose0_out, + transpose1_out, + transpose2_out, + transpose0_out_xshape, + transpose1_out_xshape, + transpose2_out_xshape, + 
                                concat1_out,
+                               nms_out_index});
+    found_subgraph_count++;
+  };
+
+  gpd(graph, handler);
+  AddStatis(found_subgraph_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(yolo_box_fuse_pass, paddle::framework::ir::YoloBoxFusePass);
diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.h b/paddle/fluid/framework/ir/yolo_box_fuse_pass.h
new file mode 100644
index 0000000000000..51dea2431f252
--- /dev/null
+++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class Graph;
+
+/*
+1. before fuse:
+         div
+          |
+         cast-----------|-------------|
+          |             |             |
+      yolo_box      yolo_box      yolo_box
+          |             |             |
+     transpose-|   transpose-|   transpose-|
+          |------|-----|-------|------|    |
+          |    concat  |       |
+          |-----|-------|-------------|
+          |   concat
+          |-------|
+             nms3
+
+2. after fuse:
+   yolo_box_head    yolo_box_head    yolo_box_head
+          |------------------|------------------|
+                      yolo_box_post
+*/
+class YoloBoxFusePass : public FusePassBase {
+ public:
+  YoloBoxFusePass();
+  virtual ~YoloBoxFusePass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+
+ private:
+  std::string name_scope_{"yolo_box_fuse_pass"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc
index 392d6c78f9c70..fb79712d47d9e 100644
--- a/paddle/fluid/framework/new_executor/executor_statistics.cc
+++ b/paddle/fluid/framework/new_executor/executor_statistics.cc
@@ -408,6 +408,7 @@ int StatisticsEngine::Stat(const platform::NodeTrees& trees) {
       // See InterpreterCore::RunInstruction for details.
if (child->Type() == platform::TracerEventType::Operator && cur_node->Name() == "compute") { + removed.insert(cur_node); removed.insert(child); } q.push(child); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 6735406aacde7..da2fd0c8c6114 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -277,7 +277,7 @@ void InterpreterCore::Convert( } for (size_t i = 0; i < vec_instruction_.size(); ++i) { - // checkout ouput + // checkout output for (auto& item : vec_instruction_[i].Outputs()) { for (auto var_id : item.second) { if (input_var2op_info_.at(var_id).size() == 0) { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index d6de37a72c772..f601a4ad28bd7 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -666,7 +666,7 @@ std::map> get_downstream_map( VLOG(6) << "downstream count: " << downstream_map_count(); VLOG(6) << "downstream_map: " << std::endl << downstream_map_to_str(); - // step2: remove unneccessary downstream ops + // step2: remove unnecessary downstream ops // for example, a->b->c // a: b, c // b: c diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.h b/paddle/fluid/framework/new_executor/workqueue/workqueue.h index e9c658e3b9dc6..2c2576528fe0e 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.h @@ -89,7 +89,7 @@ struct WorkQueueOptions { // If you need to blocking the calling thread to wait "queue empty", set // track_task = true and set events_waiter. EventsWaiter::WaitEvent will // block the calling thread until any of events (including "queue empty") - // occured. + // occurred. bool track_task; // If you need to be noticed when a WorkQueue Destruct() , set detached = // false and set events_waiter. 
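Aside on the interpretercore_util.cc hunk above: the patched comment describes step 2 of building the downstream map, where a direct dependency edge is dropped once it is already implied transitively (for a->b->c, keeping "a: b, c" and "b: c" makes the direct "a: c" edge redundant). The sketch below is only a standalone illustration of that one-step reduction over plain STL containers; the function name and types are made up for the example and are not Paddle's actual get_downstream_map code.

// Illustration only: remove op->d when another dependency m of op already
// reaches d directly, i.e. op->m->d exists (covers the a->b->c case).
#include <iostream>
#include <map>
#include <set>

std::map<int, std::set<int>> ReduceDownstreamMap(
    const std::map<int, std::set<int>>& downstream) {
  std::map<int, std::set<int>> reduced;
  for (const auto& [op, deps] : downstream) {
    for (int d : deps) {
      bool implied = false;
      for (int m : deps) {
        if (m == d) continue;
        auto it = downstream.find(m);
        if (it != downstream.end() && it->second.count(d)) {
          implied = true;  // op -> m -> d already orders op before d
          break;
        }
      }
      if (!implied) reduced[op].insert(d);
    }
  }
  return reduced;
}

int main() {
  // a=0, b=1, c=2;  a: {b, c}, b: {c}  ==>  a: {b}, b: {c}
  const std::map<int, std::set<int>> m{{0, {1, 2}}, {1, {2}}};
  for (const auto& [op, deps] : ReduceDownstreamMap(m)) {
    std::cout << op << ":";
    for (int d : deps) std::cout << ' ' << d;
    std::cout << '\n';
  }
  return 0;
}

Running this prints "0: 1" and "1: 2", matching the a->b->c example in the patched comment; it only checks one level of indirection and is not the algorithm the pass itself uses.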
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index a2e9d972c48bc..8b1f0942f820a 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -82,9 +82,9 @@ class OpKernelType { inline std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) { - os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" - << kernel_key.data_layout_ << "]:place[" << kernel_key.place_ - << "]:library_type[" << kernel_key.library_type_ << "]"; + os << "{data_type[" << kernel_key.data_type_ << "]; data_layout[" + << kernel_key.data_layout_ << "]; place[" << kernel_key.place_ + << "]; library_type[" << kernel_key.library_type_ << "]}"; return os; } diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 3879a7957600d..20f695d40568e 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -27,16 +27,15 @@ TEST(OpKernelType, ToString) { LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), - "data_type[float]:data_layout[NCHW]:place[Place(cpu)]:library_type[" - "CUDNN]"); + "{data_type[float]; data_layout[NCHW]; place[Place(cpu)]; " + "library_type[CUDNN]}"); using CUDAPlace = paddle::platform::CUDAPlace; OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), - "data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" - "Place(gpu:0)]:library_" - "type[CUDNN]"); + "{data_type[::paddle::platform::float16]; data_layout[NCHW]; " + "place[Place(gpu:0)]; library_type[CUDNN]}"); } TEST(OpKernelType, Hash) { diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index e259d6d417a5c..295510cdb1cf2 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -487,7 +487,7 @@ void AddLinkToCinnOp(const GraphNodeSet& cluster_inputs, void AddCinnOpToGraph(const GraphNodeSet& cluster, const GraphNodeSet& cluster_inputs, const GraphNodeSet& cluster_outputs, - const std::string& compilation_key, + int64_t compilation_key, const std::unordered_set& deny_var_set, Graph* graph) { // Add the cinn launch op @@ -511,7 +511,7 @@ void AddCinnOpToGraph(const GraphNodeSet& cluster, ExtractOpRole(cluster)); cinn_op_desc.Flush(); auto* cinn_op_node = graph->CreateOpNode(&cinn_op_desc); - // Add new links from or to the the cinn launch op node + // Add new links from or to the cinn launch op node AddLinkToCinnOp(cluster_inputs, cluster_outputs, cinn_op_node); VLOG(4) << "Add op [" << kCinnLaunchOp << "] into graph."; @@ -536,7 +536,7 @@ void RemoveSubGraphFromGraph(const GraphNodeSet& cluster, void ReplaceSubGraphWithCinnOpNode( const GraphNodeSet& cluster, const GraphNodeSet& cluster_inputs, const GraphNodeSet& cluster_outputs, const GraphNodeSet& cluster_internals, - const std::string& compilation_key, + int64_t compilation_key, const std::unordered_set& deny_var_set, Graph* graph) { // Add the cinn op node whose name is "kCinnLaunchOp" into graph AddCinnOpToGraph(cluster, cluster_inputs, cluster_outputs, compilation_key, @@ -545,6 +545,15 @@ void ReplaceSubGraphWithCinnOpNode( RemoveSubGraphFromGraph(cluster, cluster_internals, graph); } +static bool IsInplaceOp(const OpDesc& op_desc) { + auto inputs = op_desc.InputArgumentNames(); 
+ std::unordered_set input_set(inputs.begin(), inputs.end()); + for (auto& name : op_desc.OutputArgumentNames()) { + if (input_set.count(name) > 0) return true; + } + return false; +} + // Search all subgraphs which all op node supported by CINN, // Here we using SubgraphDetector to detecte the subgraph that // all of op node supported by CINN. We using OpMapperRegistry @@ -565,9 +574,10 @@ void SearchAllSubgraphs(Graph* graph) { if (deny_ops.size()) { return registered && !deny_ops.count(node->Name()); } + // if the user doesn't set FLAGS_allow_cinn_ops and FLAGS_deny_cinn_ops, // return true only when it is registered in CINN - return registered; + return registered && (node->IsOp() && !IsInplaceOp(*node->Op())); }; VLOG(4) << "The allowed Cinn Ops: " << FLAGS_allow_cinn_ops; VLOG(4) << "The denied Cinn Ops: " << FLAGS_deny_cinn_ops; @@ -603,7 +613,7 @@ void SearchAllSubgraphs(Graph* graph) { // Create a new subgraph according to the found cluster and // save it in CinnCompiler - std::string compilation_key = cinn_compiler->AddGraph(CreateNewSubGraph( + auto compilation_key = cinn_compiler->AddGraph(CreateNewSubGraph( cluster_set, cluster_internals, cluster_inputs, cluster_outputs)); VLOG(4) << "Compilation Key:\n" << cinn_compiler->ReadableKey(compilation_key); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index c11c7124b6277..d593aadc02c73 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -90,12 +90,12 @@ inline bool CheckGraphIndependence(const std::unordered_set& nodes) { } // Get compilation_key values -std::vector GetCompilationKeys(const Graph& graph) { - std::vector compilation_keys; +std::vector GetCompilationKeys(const Graph& graph) { + std::vector compilation_keys; for (auto& node : graph.Nodes()) { if (node->IsOp() && node->Name() == kCinnLaunchOp) { compilation_keys.emplace_back(BOOST_GET_CONST( - std::string, node->Op()->GetAttr(operators::kCompilationKey))); + int64_t, node->Op()->GetAttr(operators::kCompilationKey))); } } return compilation_keys; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc index 499d243b25f8f..9b5ce876c256f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -77,22 +78,17 @@ bool CinnCacheKey::operator==(const CinnCacheKey& other) const { input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_; } -size_t CinnCacheKey::Hash::hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); -} - size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { - std::size_t ret = 0; + std::ostringstream has_str; - std::hash string_hasher; for (const auto& name_shape : key.input_shapes_) { - ret = hash_combine(ret, string_hasher(name_shape.first)); - ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); + has_str << name_shape.first; + has_str << name_shape.second.to_str(); } - ret = hash_combine(ret, key.graph_hash_val_); - ret = hash_combine(ret, string_hasher(key.arch_str_)); - return ret; + has_str << key.graph_hash_val_; + has_str << key.arch_str_; + return std::hash()(has_str.str()); } size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& 
graph) { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h index 239e9e561c9fc..d87ea843b9e7d 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -58,7 +58,6 @@ class CinnCacheKey { bool operator!=(const CinnCacheKey& other) const; struct Hash { - static size_t hash_combine(size_t seed, size_t value); size_t operator()(const CinnCacheKey& key) const; }; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 51dca93c7c7f0..12f603542066f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -61,8 +61,8 @@ using ::cinn::hlir::framework::BuildScope; using ::cinn::hlir::framework::GraphCompiler; CinnCompiler* CinnCompiler::GetInstance() { - static CinnCompiler instance; - return &instance; + static CinnCompiler* instance = new CinnCompiler(); + return instance; } const CinnCompiledObject& CinnCompiler::Compile( @@ -110,7 +110,7 @@ const CinnCompiledObject& CinnCompiler::Compile( } const CinnCompiledObject& CinnCompiler::Compile( - const std::string& compilation_key, + int64_t compilation_key, const std::map& input_tensors, const Target& target, void* stream) { const auto& graph = FindGraph(compilation_key); @@ -126,12 +126,8 @@ const CinnCompiledObject& CinnCompiler::GetCompiledObject( return *res->second; } -std::string CinnCompiler::AddGraph(std::unique_ptr graph) { - std::string graph_key; - ProgramDesc program; - GraphToProgram(*graph, &program); - program.Proto()->SerializeToString(&graph_key); - +int64_t CinnCompiler::AddGraph(std::unique_ptr graph) { + int64_t graph_key = std::hash()((&(*graph))); PADDLE_ENFORCE_EQ( graphs_.count(graph_key), 0, platform::errors::PreconditionNotMet( @@ -143,16 +139,17 @@ std::string CinnCompiler::AddGraph(std::unique_ptr graph) { return graph_key; } -const Graph& CinnCompiler::FindGraph(const std::string& graph_key) const { +const Graph& CinnCompiler::FindGraph(int64_t graph_key) const { + auto it = graphs_.find(graph_key); PADDLE_ENFORCE_NE( - graphs_.count(graph_key), 0, + it, graphs_.end(), platform::errors::PreconditionNotMet( - "Can not find the target graph, of which the key is:\n%s", - ReadableKey(graph_key).c_str())); - return *graphs_.at(graph_key); + "Can not find the target graph, of which the key is: %lld", + graph_key)); + return *it->second; } -std::string CinnCompiler::VizGraph(const std::string& graph_key) const { +std::string CinnCompiler::VizGraph(int64_t graph_key) const { const Graph& graph = FindGraph(graph_key); return VizGraph(graph); } @@ -200,11 +197,24 @@ std::string CinnCompiler::VizGraph(const Graph& graph) const { return dot.Build(); } -std::string CinnCompiler::ReadableKey( - const std::string& compilation_key) const { - proto::ProgramDesc desc; - desc.ParseFromString(compilation_key); - return desc.DebugString(); +std::string CinnCompiler::SerializeKey(int64_t compilation_key) const { + const auto& graph = FindGraph(compilation_key); + + ProgramDesc program; + GraphToProgram(graph, &program); + + std::string serial_graph; + program.Proto()->SerializeToString(&serial_graph); + return serial_graph; +} + +std::string CinnCompiler::ReadableKey(int64_t compilation_key) const { + const auto& graph = FindGraph(compilation_key); + + ProgramDesc program; + GraphToProgram(graph, &program); + + return program.Proto()->DebugString(); } void 
CinnCompiler::Clear() { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 7e5df6faf0819..a38e8b4c5f674 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -78,21 +78,23 @@ class CinnCompiler { const ::cinn::common::Target& target, void* stream = nullptr); const CinnCompiledObject& Compile( - const std::string& compilation_key, + int64_t compilation_key, const std::map& input_tensors, const ::cinn::common::Target& target, void* stream = nullptr); const CinnCompiledObject& GetCompiledObject(int64_t cached_index) const; - std::string AddGraph(std::unique_ptr graph); + int64_t AddGraph(std::unique_ptr graph); - const ir::Graph& FindGraph(const std::string& graph_key) const; + const ir::Graph& FindGraph(int64_t graph_key) const; - std::string VizGraph(const std::string& graph_key) const; + std::string VizGraph(int64_t graph_key) const; std::string VizGraph(const ir::Graph& graph) const; - std::string ReadableKey(const std::string& compilation_key) const; + std::string SerializeKey(int64_t compilation_key) const; + + std::string ReadableKey(int64_t compilation_key) const; void Clear(); @@ -115,7 +117,7 @@ class CinnCompiler { const std::map& input_tensors, const CinnCompiledObject& compiled_obj) const; - std::unordered_map> graphs_; + std::unordered_map> graphs_; std::unordered_map cache_by_address_; std::unordered_map diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index 44f4424d70d4c..255e318c9fa69 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -59,12 +59,12 @@ std::ostream& operator<<(std::ostream& os, const std::vector& vec) { } // Get compilation_key values -std::vector GetCompilationKeys(const Graph& graph) { - std::vector compilation_keys; +std::vector GetCompilationKeys(const Graph& graph) { + std::vector compilation_keys; for (auto& node : graph.Nodes()) { if (node->IsOp() && node->Name() == kCinnLaunchOp) { compilation_keys.emplace_back(BOOST_GET_CONST( - std::string, node->Op()->GetAttr(operators::kCompilationKey))); + int64_t, node->Op()->GetAttr(operators::kCompilationKey))); } } return compilation_keys; @@ -83,13 +83,12 @@ std::unordered_set ExtractOpTypes(const Graph& graph) { // Get inputs info std::unordered_map> GetInputsInfo( - const std::string& key, const Graph& graph) { + int64_t key, const Graph& graph) { std::unordered_set inputs; for (auto& node : graph.Nodes()) { if (node->IsOp() && node->Name() == kCinnLaunchOp) { - if (BOOST_GET_CONST(std::string, - node->Op()->GetAttr(operators::kCompilationKey)) != - key) { + if (BOOST_GET_CONST(int64_t, node->Op()->GetAttr( + operators::kCompilationKey)) != key) { continue; } for (auto in_var_name : node->Op()->InputArgumentNames()) { @@ -251,8 +250,7 @@ TEST(CinnCompilerTest, Compile) { const auto& compiling_graph = cinn_compiler->FindGraph(compilation_key); viz_graph("compiling_graph.dot", const_cast(&compiling_graph)); - EXPECT_THROW(cinn_compiler->FindGraph("no_existed"), - paddle::platform::EnforceNotMet); + EXPECT_THROW(cinn_compiler->FindGraph(0), paddle::platform::EnforceNotMet); auto inputs_info = GetInputsInfo(compilation_key, *graph); std::unordered_map create_inputs; diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index efbab83f7d0e8..4c95f01ae569f 100644 --- 
a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -421,7 +421,7 @@ void PruneBackwardImpl(proto::BlockDesc* origin, proto::BlockDesc* pruned) { for (const auto& name : var_names) { if (var_map.count(name)) { // NOTE(zhiqiu): For operator in a conditional block, the related vars - // may not exist in current block, but in its futher block. + // may not exist in current block, but in its further block. *pruned_vars->Add() = var_map[name]; } } diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 9b12870a2bb9b..aec40a5a7ebdd 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -95,8 +95,46 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, return; } +void add_sparse_optimizer( + std::unordered_map& config, // NOLINT + const ::paddle::SparseCommonSGDRuleParameter& sgd_param, + const std::string& prefix = "") { + auto optimizer_name = sgd_param.name(); + if (optimizer_name == "naive") { + config[prefix + "learning_rate"] = sgd_param.naive().learning_rate(); + config[prefix + "initial_range"] = sgd_param.naive().initial_range(); + if (sgd_param.naive().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + if (sgd_param.adagrad().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } + } else if (optimizer_name == "std_adagrad") { + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + if (sgd_param.adagrad().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + if (sgd_param.adam().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } + } +} + void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { - // add for hbmps optimizer config + // optimizer config for hbmps auto fleet_desc_str = trainer_desc.fleet_desc(); google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param); auto sparse_table = @@ -105,7 +143,7 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { auto sparse_table_accessor_parameter = sparse_table_accessor.downpour_accessor_param(); auto accessor_class = sparse_table_accessor.accessor_class(); - // gpups' sparse table optimizer config + // NOTE(zhangminxu): gpups' sparse table optimizer config, // now only support single sparse table // auto sparse_table = param_.sparse_table(0); std::unordered_map config; @@ -126,7 +164,14 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { 
config["max_bound"] = sparse_table_accessor.sparse_sgd_param().weight_bounds()[1]; } + // NOTE(zhangminxu): for DownpourCtrAccessor & DownpourCtrDoubleAccessor, + // optimizer config for embed_w & embedx_w is the same config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + config["mf_learning_rate"] = config["learning_rate"]; + config["mf_initial_g2sum"] = config["initial_g2sum"]; + config["mf_initial_range"] = config["initial_range"]; + config["mf_min_bound"] = config["min_bound"]; + config["mf_max_bound"] = config["max_bound"]; } else if (accessor_class == "DownpourSparseValueAccessor") { auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name(); if (optimizer_name == "naive") { @@ -186,71 +231,12 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { accessor_class == "DownpourDoubleUnitAccessor") { config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); - auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name(); - if (optimizer_name == "naive") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().naive().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().naive().initial_range(); - if (sparse_table_accessor.embedx_sgd_param() - .naive() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = - sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0]; - config["mf_max_bound"] = - sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1]; - } - } else if (optimizer_name == "adagrad") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); - config["mf_initial_g2sum"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); - if (sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[0]; - config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[1]; - } - } else if (optimizer_name == "std_adagrad") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); - config["mf_initial_g2sum"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); - if (sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[0]; - config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[1]; - } - } else if (optimizer_name == "adam") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adam().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adam().initial_range(); - if (sparse_table_accessor.embedx_sgd_param() - .adam() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = - sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0]; - config["mf_max_bound"] = - sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1]; - } - } config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + // optimizer config for 
embed_w and embedx + add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); + add_sparse_optimizer(config, sparse_table_accessor.embedx_sgd_param(), + "mf_"); } - auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); ps_gpu_wrapper->InitializeGPUServer(config); } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 5186f8fcc1c51..8ce18d89c9b43 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -180,6 +180,11 @@ void TensorFromArray(const T* src, const size_t& array_size, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_MLU + else if (platform::is_mlu_place(dst_place)) { // NOLINT + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } +#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(dst_place)) { // NOLINT memory::Copy( @@ -247,9 +252,7 @@ void TensorFromVector(const std::vector& src, #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, dst_ptr, src_place, src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -448,9 +451,7 @@ inline void TensorToVector(const Tensor& src, #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, dst_ptr, src.place(), src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 2496d4d040e2e..b86b4fec8a571 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -37,7 +37,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/port.h" #ifdef PADDLE_WITH_PSLIB -#include +#include "proto/ps.pb.h" #endif namespace paddle { diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index e928cbb654839..76f64ab73a64b 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -186,7 +186,7 @@ template static void SetForwardDataTypeOfGradVars(const NameVarMap& outs) { for (auto& var_pair : outs) { for (auto& var : var_pair.second) { - // NOTE(zhiqu): The ouput may be NULL because of pruning. + // NOTE(zhiqu): The output may be NULL because of pruning. 
if (var) { SetForwardDataTypeOfGradVar(var); } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 38180ba963c38..cfd3813d60d44 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -317,9 +317,11 @@ PreparedOp PrepareImpl( << " | kernel key: " << pt_cpu_kernel_key << " | kernel: " << pt_cpu_kernel; auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); - return PreparedOp(op, empty_ctx, expected_kernel_key, arg_map_fn, - default_kernel_signature, std::move(kernel_signature), - pt_cpu_kernel, cpu_ctx); + return PreparedOp( + op, empty_ctx, + framework::TransPhiKernelKeyToOpKernelType(pt_cpu_kernel_key), + arg_map_fn, default_kernel_signature, std::move(kernel_signature), + pt_cpu_kernel, cpu_ctx); } } } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 03fa46eab5367..c7fd2215eb42a 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -879,7 +879,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { } // TODO(liuyuhui): If BKCL support non-blocking communication, it should be -// fixed as same as multi gpus card trainging. +// fixed as same as multi gpus card training. void Reducer::MarkGroupReady(size_t group_index) { PADDLE_ENFORCE_GE( group_index, next_group_, @@ -957,7 +957,7 @@ void Reducer::FusedAllReduceSchedule(const int run_order, Group &group, // default stream for communicating, so there exist some problems in // synchronization. And need to add a WaitComm there. // TODO(liuyuhui): If BKCL support non-blocking communication, it should be -// fixed as multi gpus card trainging. +// fixed as multi gpus card training. #ifdef PADDLE_WITH_XPU_BKCL if (platform::is_xpu_place(group.dense_tensors_[0].place())) { parallel_ctx_->WaitComm(run_order); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7b274339e3cbe..350263bc5457d 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -192,7 +192,7 @@ void Tracer::TraceOpImpl(const std::string& type, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - "trace_op", platform::TracerEventType::Operator, 1); + type, platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 7fae481f58289..633f481df808b 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -109,7 +109,11 @@ endif() set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME paddle_inference) if(NOT APPLE AND NOT WIN32) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. 
- set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map") + if (WITH_CUSTOM_DEVICE) + set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_custom_device.map") + else() + set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map") + endif() set_target_properties(paddle_inference_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") # check symbol hidden FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index dab1b9f7b1135..3d1a467565c84 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,5 +1,5 @@ unset(analysis_deps CACHE) -set(analysis_deps # analysis_deps can be extended accross the project +set(analysis_deps # analysis_deps can be extended across the project framework_proto proto_desc graph pass paddle_inference_io executor pretty_log ir_pass_manager CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index 09494a360270b..0c9f8d7e16558 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ /* - * This file defines the the class to partition a graph. + * This file defines the class to partition a graph. */ #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h index 621c631b8539b..21bfe7582061a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ /* - * This file defines the the class to partition a graph. + * This file defines the class to partition a graph. */ #pragma once diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index e4fc52b6fa744..bc7dc9704ac5e 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -286,7 +286,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // There are models with the same structure but the different parameters, // when running in the 'use_serialize' mode, there is a bug. // serialization is affected by max_batch_size, but calibration is not. - // So we use seperate engine keys in serialization and calibration. + // So we use separate engine keys in serialization and calibration. 
auto engine_key = GenerateEngineKey( input_names_with_id, output_names_with_id, std::to_string(0), std::to_string(max_batch_size), @@ -377,12 +377,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( trt_engine->SetUseDLA(Get("trt_use_dla")); trt_engine->SetDLACore(Get("trt_dla_core")); trt_engine->SetUseInspector(Get("use_inspector")); - - trt_engine->SetWithErnie( - (graph->Has(framework::ir::kEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass)) || - (graph->Has(framework::ir::kPrelnEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass))); + trt_engine->SetWithErnie(graph->Has(framework::ir::kMultiheadMatmulPass)); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index edec1b1c7d0e4..56cc4aa755bda 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -50,10 +50,10 @@ if(WITH_GPU AND TENSORRT_FOUND) endif() if (WITH_ONNXRUNTIME) - cc_library(analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + cc_library(analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) else (WITH_ONNXRUNTIME) - cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + cc_library(analysis_predictor SRCS analysis_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) endif (WITH_ONNXRUNTIME) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 4827fe6c1ac97..735e1b7be4c1f 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -158,6 +158,19 @@ void AnalysisConfig::EnableNpu(int device_id) { Update(); } +void AnalysisConfig::EnableCustomDevice(const std::string &device_type, + int device_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + use_custom_device_ = true; + custom_device_id_ = device_id; + custom_device_type_ = device_type; +#else + LOG(ERROR) << "Please compile with CustomDevice to EnableCustomDevice()"; + use_custom_device_ = false; +#endif + Update(); +} + void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, bool ipu_enable_pipelining, int ipu_batches_per_step) { @@ -324,6 +337,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // fleet exe related CP_MEMBER(dist_config_); + // custom device related. 
+ CP_MEMBER(use_custom_device_); + CP_MEMBER(custom_device_type_); + CP_MEMBER(custom_device_id_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, platform::errors::InvalidArgument( @@ -539,7 +557,8 @@ void AnalysisConfig::Update() { if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu())) || ((use_xpu() ^ pass_builder_->use_xpu())) || ((use_npu() ^ pass_builder_->use_npu())) || - ((use_ipu() ^ pass_builder_->use_ipu()))) { + ((use_ipu() ^ pass_builder_->use_ipu())) || + ((use_custom_device() ^ pass_builder_->use_custom_device()))) { if (use_gpu()) { pass_builder_.reset(new GpuPassStrategy); @@ -562,6 +581,12 @@ void AnalysisConfig::Update() { platform::errors::InvalidArgument( "Only one choice can be made between GPU and NPU.")); pass_builder_.reset(new NpuPassStrategy); + } else if (use_custom_device()) { + PADDLE_ENFORCE_EQ( + use_gpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between GPU and CustomDevice.")); + pass_builder_.reset(new CustomDevicePassStrategy); } else { pass_builder_.reset(new CpuPassStrategy); } @@ -588,6 +613,13 @@ void AnalysisConfig::Update() { "Only one choice can be made between GPU and NPU.")); pass_builder_.reset(new NpuPassStrategy( *static_cast(pass_builder_.get()))); + } else if (use_custom_device()) { + PADDLE_ENFORCE_EQ( + use_gpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between GPU and CustomDevice.")); + pass_builder_.reset(new CustomDevicePassStrategy( + *static_cast(pass_builder_.get()))); } else { pass_builder_.reset(new CpuPassStrategy( *static_cast(pass_builder_.get()))); @@ -733,7 +765,13 @@ void AnalysisConfig::Update() { "but did not have the option -DWITH_IPU compiled.")); #endif } - + if (use_custom_device_) { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW(platform::errors::Unavailable( + "You tried to enable the custom device " + "but did not have the option -DWITH_CUSTOM_DEVICE compiled.")); +#endif + } if (ir_debug_) { pass_builder()->TurnOnDebug(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4f0d4a908380f..6c81997d13562 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -332,6 +332,15 @@ bool AnalysisPredictor::CreateExecutor() { PADDLE_THROW(platform::errors::Unavailable( "You tried to use IPU forward propagation, but Paddle was not compiled " "with WITH_IPU.")); +#endif + } else if (config_.use_custom_device()) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + place_ = paddle::platform::CustomPlace(config_.custom_device_type()); +#else + PADDLE_THROW(platform::errors::Unavailable( + "You tried to use CustomDevice forward propagation, but Paddle was not " + "compiled " + "with WITH_CUSTOM_DEVICE.")); #endif } else { place_ = paddle::platform::CPUPlace(); @@ -1241,6 +1250,12 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( } else if (platform::is_npu_place(place_)) { auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); + } else if (platform::is_custom_place(place_)) { + auto custom_place = place_; + auto paddleplace = static_cast( + static_cast(PaddlePlace::kCUSTOM) + + phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); @@ -1290,6 +1305,12 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } else if 
(platform::is_npu_place(place_)) { auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); + } else if (platform::is_custom_place(place_)) { + auto custom_place = place_; + auto paddleplace = static_cast( + static_cast(PaddlePlace::kCUSTOM) + + phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); @@ -1723,6 +1744,8 @@ USE_TRT_CONVERTER(flatten_contiguous_range); USE_TRT_CONVERTER(matmul); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(exp); +USE_TRT_CONVERTER(log); USE_TRT_CONVERTER(sigmoid); USE_TRT_CONVERTER(tanh); USE_TRT_CONVERTER(fc); @@ -1754,6 +1777,7 @@ USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); USE_TRT_CONVERTER(anchor_generator); USE_TRT_CONVERTER(yolo_box); +USE_TRT_CONVERTER(yolo_box_head); USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 0c68acfe98047..bb966dc5c6c1b 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -224,8 +224,23 @@ void Tensor::CopyFromCpu(const T *data) { "with NPU.")); #endif } else { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_type_id = + static_cast(place_) - static_cast(PlaceType::kCUSTOM); + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + paddle::platform::CustomPlace custom_place( + phi::GetGlobalDeviceType(device_type_id), device_); + auto *t_data = tensor->mutable_data(custom_place); + auto *dev_ctx = static_cast( + pool.Get(custom_place)); + paddle::memory::Copy(custom_place, static_cast(t_data), + paddle::platform::CPUPlace(), data, ele_size, + dev_ctx->stream()); +#else PADDLE_THROW(paddle::platform::errors::InvalidArgument( "The analysis predictor supports CPU, GPU, NPU and XPU now.")); +#endif } } @@ -398,8 +413,20 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, "with NPU.")); #endif } else { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + auto custom_place = t_place; + auto *dev_ctx = static_cast( + pool.Get(custom_place)); + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), custom_place, t_data, + ele_num * sizeof(T), dev_ctx->stream()); +// TODO(wangran16): sync_stream +#else PADDLE_THROW(paddle::platform::errors::InvalidArgument( "The analysis predictor supports CPU, GPU, NPU and XPU now.")); +#endif } } diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc new file mode 100644 index 0000000000000..7706f2d0824e3 --- /dev/null +++ b/paddle/fluid/inference/api/infer_context.cc @@ -0,0 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/infer_context.h" + +namespace paddle {} // namespace paddle diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h new file mode 100644 index 0000000000000..b7a8bf637d872 --- /dev/null +++ b/paddle/fluid/inference/api/infer_context.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/phi/backends/all_context.h" + +namespace paddle { + +class InferCPUContext : public phi::CPUContext { + public: + using phi::CPUContext::SetEigenDevice; +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +class InferGPUContext : public phi::GPUContext { + public: + using phi::GPUContext::SetStream; + using phi::GPUContext::SetEigenDevice; + using phi::GPUContext::SetBlasHandle; + using phi::GPUContext::SetBlasTensorCoreHandle; + using phi::GPUContext::SetBlasTF32Handle; + using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSparseHandle; + // using phi::GPUContext::SetDnnWorkspaceHandle; + using phi::GPUContext::SetComputeCapability; + using phi::GPUContext::SetMaxThreadsPerMultiProcessor; + using phi::GPUContext::SetMultiProcessors; + using phi::GPUContext::SetMaxThreadsPerBlock; + using phi::GPUContext::SetMaxGridDimSize; + using phi::GPUContext::SetDriverVersion; + using phi::GPUContext::SetRuntimeVersion; +}; +#endif +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 3a3e6a0908ea1..4dc80a1d75390 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -571,6 +571,7 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { auto* builder = predictor_.config_.pass_builder(); builder->SetPasses({ "cpu_quantize_pass", "cpu_quantize_squash_pass", + "int8_scale_calculation_mkldnn_pass", }); if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); auto passes = builder->AllPasses(); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 9c48d822b4d0d..af6cf88a3224f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -332,6 +332,14 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableNpu(int device_id = 0); /// + /// \brief Turn on CustomDevice. + /// + /// \param device_type device_type the custom device to use. + /// + /// \param device_id device_id the custom device to use (default is 0). + /// + void EnableCustomDevice(const std::string& device_type, int device_id); + /// /// \brief Turn on ONNXRuntime. 
/// void EnableONNXRuntime(); @@ -366,6 +374,11 @@ struct PD_INFER_DECL AnalysisConfig { /// \return bool Whether the IPU is turned on. /// bool use_ipu() const { return use_ipu_; } + /// \brief A boolean state telling whether the CustomDevice is turned on. + /// + /// \return bool Whether the CustomDevice is turned on. + /// + bool use_custom_device() const { return use_custom_device_; } /// /// \brief A boolean state telling whether the ONNXRuntime is turned on. /// @@ -397,12 +410,23 @@ struct PD_INFER_DECL AnalysisConfig { /// \return int The NPU device id. /// int npu_device_id() const { return npu_device_id_; } - /// \brief Get the the number of IPU device . + /// \brief Get the number of IPU device . /// /// \return int The number of IPU device. /// int ipu_device_num() const { return ipu_device_num_; } /// + /// \brief Get the custom device id. + /// + /// \return int The custom device id. + /// + int custom_device_id() const { return custom_device_id_; } + /// \brief Get the custom device type. + /// + /// \return string The custom device type. + /// + std::string custom_device_type() const { return custom_device_type_; } + /// /// \brief Get the initial size in MB of the GPU memory pool. /// /// \return int The initial size in MB of the GPU memory pool. @@ -900,6 +924,11 @@ struct PD_INFER_DECL AnalysisConfig { bool use_npu_{false}; int npu_device_id_{0}; + // CustomDevice related + bool use_custom_device_{false}; + int custom_device_id_{0}; + std::string custom_device_type_; + // ONNXRuntime related bool use_onnxruntime_{false}; bool enable_ort_optimization_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 01988d5f539dc..77203b069e602 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -98,18 +98,22 @@ const std::vector kTRTSubgraphPasses({ "multihead_matmul_fuse_pass_v3", // "skip_layernorm_fuse_pass", // "preln_skip_layernorm_fuse_pass", // - "conv_bn_fuse_pass", // - "unsqueeze2_eltwise_fuse_pass", // - "trt_squeeze2_matmul_fuse_pass", // - "trt_reshape2_matmul_fuse_pass", // - "trt_flatten2_matmul_fuse_pass", // - "trt_map_matmul_v2_to_mul_pass", // - "trt_map_matmul_v2_to_matmul_pass", // - "trt_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "conv_elementwise_add_fuse_pass", // - "tensorrt_subgraph_pass", // - "conv_bn_fuse_pass", // + // "set_transformer_input_convert_pass", // + "conv_bn_fuse_pass", // + "unsqueeze2_eltwise_fuse_pass", // + "trt_squeeze2_matmul_fuse_pass", // + "trt_reshape2_matmul_fuse_pass", // + "trt_flatten2_matmul_fuse_pass", // + "trt_map_matmul_v2_to_mul_pass", // + "trt_map_matmul_v2_to_matmul_pass", // + "trt_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + // "remove_padding_recover_padding_pass", // + // "delete_remove_padding_recover_padding_pass", // + // "yolo_box_fuse_pass", // + "tensorrt_subgraph_pass", // + "conv_bn_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -282,7 +286,8 @@ void CpuPassStrategy::EnableMKLDNN() { "depthwise_conv_mkldnn_pass", // "conv_bn_fuse_pass", // Execute BN passes again to "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_transpose_bn_fuse_pass", // + "conv_affine_channel_mkldnn_fuse_pass", // + "conv_transpose_bn_fuse_pass", // 
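A hedged usage sketch for the new AnalysisConfig switches; the model path and the device type string are placeholders, and the call sequence is assumed to mirror the existing EnableNpu/EnableXpu flow rather than taken from this patch.

paddle::AnalysisConfig config("./model_dir");          // hypothetical path
config.EnableCustomDevice("my_custom_dev", /*device_id=*/0);

if (config.use_custom_device()) {
  std::string dev_type = config.custom_device_type();  // "my_custom_dev"
  int dev_id = config.custom_device_id();              // 0
  // hand dev_type / dev_id to whatever creates the runtime context
}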
"conv_transpose_eltwiseadd_bn_fuse_pass", // "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index db6bde62ddc7c..f01799c646077 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -166,6 +166,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in ipu mode. bool use_ipu() const { return use_ipu_; } + /// \brief Check if we are using CustomDevice. + /// \return A bool variable implying whether we are in CustomDevice mode. + bool use_custom_device() const { return use_custom_device_; } + /// \brief Default destructor. virtual ~PassStrategy() = default; @@ -177,6 +181,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_ipu_{false}; bool use_mkldnn_{false}; bool use_gpu_fp16_{false}; + bool use_custom_device_{false}; /// \endcond }; @@ -291,6 +296,22 @@ class PD_INFER_DECL NpuPassStrategy final : public PassStrategy { } }; +/// \class CustomDevicePassStrategy +/// \brief The CustomDevice passes controller, it is used in AnalysisPredictor +/// with CustomDevice +/// mode. +class PD_INFER_DECL CustomDevicePassStrategy final : public PassStrategy { + public: + CustomDevicePassStrategy() : PassStrategy({}) { use_custom_device_ = true; } + + /// \brief Construct by copying another CustomDevicePassStrategy object. + /// \param[in] other The CustomDevicePassStrategy object we want to copy. + explicit CustomDevicePassStrategy(const CustomDevicePassStrategy &other) + : PassStrategy(other.AllPasses()) { + use_custom_device_ = true; + } +}; + /// \class IpuPassStrategy /// \brief The IPU passes controller, it is used in AnalysisPredictor with IPU /// mode. diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 3cd2df3aef639..11086b369fc15 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -54,7 +54,7 @@ enum DataType { // TODO(Superjomn) support more data types if needed. }; -enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; +enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU, kCUSTOM }; enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW }; diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc new file mode 100644 index 0000000000000..d88f282ce7a62 --- /dev/null +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/api/resource_manager.h" + +#include + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/backends/gpu/forwards.h" +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_resources.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/generator.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace internal { + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +class EigenGpuStreamDevice : public Eigen::StreamInterface { + public: + EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenGpuStreamDevice() override {} + + void Reinitialize(gpuStream_t cuda_stream, phi::Allocator* allocator, + GPUPlace place) { + stream_ = cuda_stream; + allocator_ = allocator; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const gpuStream_t& stream() const override { return stream_; } + + const gpuDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + if (UNLIKELY(num_bytes == 0)) { + return nullptr; + } + auto buf = allocator_->Allocate(num_bytes); + VLOG(4) << "Eigen allocated at " << buf->ptr() << " requested " + << num_bytes; + void* retv = buf->ptr(); + { + std::lock_guard lock(mtx_); + allocations_.emplace(retv, std::move(buf)); + } + return retv; + } + + void deallocate(void* buffer) const override { + if (LIKELY(buffer)) { + std::lock_guard lock(mtx_); + allocations_.erase(buffer); + } + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#endif + } + return semaphore_; + } + + private: + gpuStream_t stream_; // not owned; + phi::Allocator* allocator_; // not owned; + const gpuDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; + mutable std::mutex mtx_; // to protect allocations_ + mutable std::unordered_map allocations_; +}; +#endif +} // namespace internal + +ResourceManager::ResourceManager(const phi::Place& place, void* stream) + : place_(place) { + InitCPUResource(); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + InitGPUResource(stream); +#endif +} + +ResourceManager::~ResourceManager() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + DestroyGPUResource(); +#endif +} + +void ResourceManager::InitCPUResource() { + cpu_eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* ResourceManager::GetCpuEigenDevice() { + return cpu_eigen_device_.get(); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +void ResourceManager::InitGPUResource(void* stream) { + if (stream == nullptr) { + owned_stream_ = true; + phi::InitStream(&stream_); + } else { + owned_stream_ = false; + stream_ = reinterpret_cast(stream); + } + + InitGpuProperties(); + InitGpuEigenDevice(); + InitDnnHanlde(); + InitBlasHandle(); + 
InitBlasLtHandle(); + InitSolverHandle(); + InitSparseHandle(); +} + +void ResourceManager::DestroyGPUResource() { + if (owned_stream_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); +#endif + stream_ = nullptr; + } + + DestroyDnnHandle(); + DestroyBlasHandle(); + DestroyBlasLtHandle(); + DestroySolverHandle(); + DestroySparseHandle(); +} + +void ResourceManager::InitGpuProperties() { + phi::backends::gpu::GPUDeviceGuard guard(place_.device); + phi::InitGpuProperties(place_, &compute_capability_, &runtime_version_, + &driver_version_, &multi_process_, + &max_threads_per_mp_, &max_threads_per_block_, + &max_grid_dim_size_); +} + +void ResourceManager::InitGpuEigenDevice() { + auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place_) + .get(); + eigen_stream_.reset(new internal::EigenGpuStreamDevice()); + eigen_stream_->Reinitialize(stream_, allocator, place_); + gpu_eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); +} + +void ResourceManager::InitDnnHanlde() { + phi::InitDnnHandle(&dnn_handle_, stream_, place_); +} + +void ResourceManager::DestroyDnnHandle() { phi::DestroyDnnHandle(dnn_handle_); } + +void ResourceManager::InitBlasHandle() { + phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#endif +#if CUDA_VERSION >= 11000 + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif +#endif +} + +void ResourceManager::DestroyBlasHandle() { + phi::DestroyBlasHandle(blas_handle_); + phi::DestroyBlasHandle(blas_tensor_core_handle_); + phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); +} + +void ResourceManager::InitBlasLtHandle() { + phi::InitBlasLtHandle(&blaslt_handle_); +} + +void ResourceManager::DestroyBlasLtHandle() { + phi::DestroyBlasLtHandle(blaslt_handle_); +} + +void ResourceManager::InitSolverHandle() { + phi::InitSolverHandle(&solver_handle_, stream_); +} + +void ResourceManager::DestroySolverHandle() { + phi::DestroySolverHandle(solver_handle_); +} + +void ResourceManager::InitSparseHandle() { + phi::InitSparseHandle(&sparse_handle_, stream_); +} + +void ResourceManager::DestroySparseHandle() { + phi::DestroySparseHandle(sparse_handle_); +} + +gpuStream_t ResourceManager::GetStream() const { return stream_; } + +dnnHandle_t ResourceManager::GetDnnHandle() const { return dnn_handle_; } + +blasHandle_t ResourceManager::GetBlasHandle() const { return blas_handle_; } + +blasHandle_t ResourceManager::GetBlasTensorCoreHandle() const { + return blas_tensor_core_handle_; +} + +blasHandle_t ResourceManager::GetBlasTF32Handle() const { + return blas_tf32_tensor_core_handle_; +} + +blasLtHandle_t ResourceManager::GetBlasLtHandle() const { + return blaslt_handle_; +} + +phi::solverHandle_t ResourceManager::GetSolverDnHandle() const { + return solver_handle_; +} + +phi::sparseHandle_t ResourceManager::GetSparseHandle() const { + return sparse_handle_; +} + +Eigen::GpuDevice* ResourceManager::GetGpuEigenDevice() const { + return gpu_eigen_device_.get(); +} + +int ResourceManager::GetGpuComputeCapability() const { + return compute_capability_; +} + +int 
ResourceManager::GetGpuRuntimeVersion() const { return runtime_version_; } + +int ResourceManager::GetGpuDriverVersion() const { return driver_version_; } + +int ResourceManager::GetGPUMultiProcessors() const { return multi_process_; } + +int ResourceManager::GetGpuMaxThreadsPerMp() const { + return max_threads_per_mp_; +} + +int ResourceManager::GetGpuMaxThreadsPerBlock() const { + return max_threads_per_block_; +} + +std::array ResourceManager::GetGpuMaxGridDimSize() const { + return max_grid_dim_size_; +} + +#endif +} // namespace paddle diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h new file mode 100644 index 0000000000000..c41968dc58590 --- /dev/null +++ b/paddle/fluid/inference/api/resource_manager.h @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/cpu/forwards.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "paddle/phi/backends/gpu/forwards.h" +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/backends/gpu/gpu_resources.h" +#endif + +namespace paddle { +namespace internal { +class EigenGpuStreamDevice; +} // namespace internal + +class ResourceManager { + public: + explicit ResourceManager(const phi::Place& place, void* stream); + ~ResourceManager(); + + public: + Eigen::DefaultDevice* GetCpuEigenDevice(); + + private: + void InitCPUResource(); + + private: + phi::Place place_; + std::unique_ptr cpu_eigen_device_; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + public: + gpuStream_t GetStream() const; + dnnHandle_t GetDnnHandle() const; + blasHandle_t GetBlasHandle() const; + blasHandle_t GetBlasTensorCoreHandle() const; + blasHandle_t GetBlasTF32Handle() const; + blasLtHandle_t GetBlasLtHandle() const; + phi::solverHandle_t GetSolverDnHandle() const; + phi::sparseHandle_t GetSparseHandle() const; + Eigen::GpuDevice* GetGpuEigenDevice() const; + int GetGpuComputeCapability() const; + int GetGpuRuntimeVersion() const; + int GetGpuDriverVersion() const; + int GetGPUMultiProcessors() const; + int GetGpuMaxThreadsPerMp() const; + int GetGpuMaxThreadsPerBlock() const; + std::array GetGpuMaxGridDimSize() const; + + private: + void InitGPUResource(void* stream); + void DestroyGPUResource(); + void InitGpuProperties(); + void InitGpuEigenDevice(); + void InitDnnHanlde(); + void DestroyDnnHandle(); + void InitBlasHandle(); + void DestroyBlasHandle(); + void InitBlasLtHandle(); + void DestroyBlasLtHandle(); + void InitSolverHandle(); + void DestroySolverHandle(); + void InitSparseHandle(); + void DestroySparseHandle(); + + private: + int compute_capability_; + int runtime_version_; + int driver_version_; + int multi_process_; + int max_threads_per_mp_; + int max_threads_per_block_; + std::array max_grid_dim_size_; + + 
bool owned_stream_{true}; + gpuStream_t stream_; + std::unique_ptr gpu_eigen_device_; + std::unique_ptr eigen_stream_; + + blasHandle_t blas_handle_{nullptr}; + blasHandle_t blas_tensor_core_handle_{nullptr}; + blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; + blasLtHandle_t blaslt_handle_{nullptr}; + dnnHandle_t dnn_handle_{nullptr}; + phi::solverHandle_t solver_handle_{nullptr}; + phi::sparseHandle_t sparse_handle_{nullptr}; +// DnnWorkspaceHandle +#endif +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map index 5bb9b8d75620b..05935701635d9 100644 --- a/paddle/fluid/inference/paddle_inference.map +++ b/paddle/fluid/inference/paddle_inference.map @@ -6,4 +6,3 @@ local: *; }; - diff --git a/paddle/fluid/inference/paddle_inference_custom_device.map b/paddle/fluid/inference/paddle_inference_custom_device.map new file mode 100644 index 0000000000000..52bc2870482e2 --- /dev/null +++ b/paddle/fluid/inference/paddle_inference_custom_device.map @@ -0,0 +1,10 @@ +{ + global: + *paddle*; + *Pass*; + *profile*; + *phi*; + *FLAGS_*; + local: + *; +}; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ec8c1b2fcd75c..22610ece34ee4 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,13 +1,42 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc - batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc group_norm_op.cc - pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc - shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc flatten_contiguous_range_op.cc - emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc + SRCS matmul_op.cc + conv2d_op.cc + fc_op.cc + pool2d_op.cc + elementwise_op.cc + batch_norm_op.cc + activation_op.cc + unary_op.cc + softmax_op.cc + concat_op.cc + dropout_op.cc + group_norm_op.cc + pad_op.cc + split_op.cc + prelu_op.cc + leaky_relu_op.cc + gelu_op.cc + layer_norm_op.cc + multihead_matmul_op.cc + shuffle_channel_op.cc + swish_op.cc + instance_norm_op.cc + stack_op.cc + transpose_op.cc + flatten_op.cc + flatten_contiguous_range_op.cc + emb_eltwise_layernorm.cc + skip_layernorm.cc + scale_op.cc + slice_op.cc + hard_sigmoid_op.cc + hard_swish_op.cc + clip_op.cc gather_op.cc anchor_generator_op.cc yolo_box_op.cc + yolo_box_head_op.cc roi_align_op.cc affine_channel_op.cc multiclass_nms_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index b2e394d14eba2..0b9a6917dd972 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -52,7 +52,7 @@ class SwishOpConverter : public OpConverter { PADDLE_ENFORCE_EQ( output_num, 1UL, platform::errors::InvalidArgument( - "The ouput Out's size must equal to 1 in TRT swish op. " + "The output Out's size must equal to 1 in TRT swish op. 
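ResourceManager gathers the per-stream GPU state (Eigen device, cuBLAS/cuDNN/solver/sparse handles, device properties) behind one object keyed by a place plus an optional externally owned stream. A usage sketch under the assumption of a CUDA build; only the constructor and getters come from the new header, the surrounding function is illustrative.

#include <cuda_runtime.h>
#include "paddle/fluid/inference/api/resource_manager.h"

void BuildInferGpuResources(cudaStream_t external_stream) {
  // A non-null stream is borrowed (the caller keeps ownership); nullptr would
  // make the manager create and later destroy its own stream.
  paddle::ResourceManager mgr(phi::GPUPlace(0), external_stream);

  auto* eigen_dev = mgr.GetGpuEigenDevice();   // ready to feed InferGPUContext
  auto blas = mgr.GetBlasHandle();
  auto dnn = mgr.GetDnnHandle();
  int sm_count = mgr.GetGPUMultiProcessors();
  (void)eigen_dev; (void)blas; (void)dnn; (void)sm_count;
}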
" "But received Out's size %u.", output_num)); // Get attrs @@ -75,7 +75,7 @@ class SwishOpConverter : public OpConverter { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta, with_fp16); - layer = engine_->AddPlugin(&input, input_num, plugin); + layer = engine_->AddPluginV2Ext(&input, input_num, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/unary_op.cc b/paddle/fluid/inference/tensorrt/convert/unary_op.cc new file mode 100644 index 0000000000000..aa3d38ebe2073 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/unary_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class UnaryOpConverter : public OpConverter { + public: + UnaryOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. 
+ framework::OpDesc op_desc(op, nullptr); + VLOG(3) << "convert a fluid unary op to tensorrt unary layer whose " + "type is " + << op_type_; + nvinfer1::ITensor* input_tensor = + engine_->GetITensor(op_desc.Input("X")[0]); + auto op_pair = ops.find(op_type_); + nvinfer1::IUnaryLayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, Unary, *input_tensor, op_pair->second); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode); + } + + protected: + std::string op_type_; + static const std::unordered_map ops; +}; + +const std::unordered_map + UnaryOpConverter::ops = { + {"exp", nvinfer1::UnaryOperation::kEXP}, + {"log", nvinfer1::UnaryOperation::kLOG}, +}; + +class ExpOpConverter : public UnaryOpConverter { + public: + ExpOpConverter() { op_type_ = "exp"; } +}; + +class LogOpConverter : public UnaryOpConverter { + public: + LogOpConverter() { op_type_ = "log"; } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(exp, ExpOpConverter); +REGISTER_TRT_OP_CONVERTER(log, LogOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc new file mode 100644 index 0000000000000..04276d94bf5e1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
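The converter above is table-driven: each supported activation is one entry in UnaryOpConverter::ops plus a trivially small registered subclass. Extending it to another TensorRT unary, say sqrt via nvinfer1::UnaryOperation::kSQRT, would look like the sketch below; this op is not part of the patch, and op_teller.cc would also need the new name added to its lists.

// Hypothetical extension, mirroring the exp/log converters above.
class SqrtOpConverter : public UnaryOpConverter {
 public:
  SqrtOpConverter() { op_type_ = "sqrt"; }
};
REGISTER_TRT_OP_CONVERTER(sqrt, SqrtOpConverter);

// ...plus one more initializer entry in UnaryOpConverter::ops:
//   {"sqrt", nvinfer1::UnaryOperation::kSQRT},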
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class YoloBoxHeadOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a yolo_box_head op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + auto* x_tensor = engine_->GetITensor(op_desc.Input("X").front()); + std::vector anchors = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchors")); + int class_num = BOOST_GET_CONST(int, op_desc.GetAttr("class_num")); + + auto* yolo_box_plugin = new plugin::YoloBoxHeadPlugin(anchors, class_num); + std::vector yolo_box_inputs; + yolo_box_inputs.push_back(x_tensor); + auto* yolo_box_head_layer = engine_->network()->addPluginV2( + yolo_box_inputs.data(), yolo_box_inputs.size(), *yolo_box_plugin); + std::vector output_names; + output_names.push_back(op_desc.Output("Out").front()); + RreplenishLayerAndOutput(yolo_box_head_layer, "yolo_box_head", output_names, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(yolo_box_head, YoloBoxHeadOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index b44450e7a8212..ba5b28a4dfed9 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -65,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller { "conv2d_fusion", "pool2d", "relu", + "exp", + "log", "softmax", "sigmoid", "hard_swish", @@ -98,6 +100,7 @@ struct SimpleOpTypeSetTeller : public Teller { "gather", "gather_nd", "yolo_box", + "yolo_box_head", "roi_align", "affine_channel", "nearest_interp", @@ -128,6 +131,8 @@ struct SimpleOpTypeSetTeller : public Teller { "conv2d_fusion", "pool2d", "relu", + "exp", + "log", "softmax", "sigmoid", "hard_swish", @@ -161,6 +166,7 @@ struct SimpleOpTypeSetTeller : public Teller { "gather", "gather_nd", "yolo_box", + "yolo_box_head", "roi_align", "affine_channel", "nearest_interp", @@ -200,7 +206,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, for (auto& teller : tellers_) { if (op_type == "relu" || op_type == "relu6" || op_type == "tanh" || - op_type == "sigmoid") { + op_type == "sigmoid" || op_type == "exp" || op_type == "log") { auto* block = desc.Block(); if (block == nullptr) { VLOG(3) << "The block desc is nullptr, we can't continue to analyze. 
" @@ -630,6 +636,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!has_attrs) return false; } + if (op_type == "yolo_box_head") { + if (with_dynamic_shape) return false; + bool has_attrs = desc.HasAttr("class_num") && desc.HasAttr("anchors"); + if (!has_attrs) return false; + } + if (op_type == "affine_channel") { if (!desc.HasAttr("data_layout")) return false; auto data_layout = framework::StringToDataLayout( @@ -941,6 +953,11 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } if (op_type == "strided_slice") { +#if !IS_TRT_VERSION_GE(7000) + VLOG(3) + << "strided_slice converter does not support trt versions below 7.0"; + return false; +#endif if (!with_dynamic_shape) { return false; } diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index be6984d0f76b5..ff6a1cd60f720 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -7,6 +7,7 @@ nv_library(tensorrt_plugin hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu anchor_generator_op_plugin.cu yolo_box_op_plugin.cu + yolo_box_head_op_plugin.cu roi_align_op_plugin.cu gather_nd_op_plugin.cu mish_op_plugin.cu diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index 9720719fd0bca..2c2fad74b9a2d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -24,6 +24,16 @@ namespace tensorrt { namespace plugin { int SwishPlugin::initialize() TRT_NOEXCEPT { return 0; } +void SwishPlugin::terminate() TRT_NOEXCEPT {} + +bool SwishPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { + if (with_fp16_) { + return type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF; + } + return type == nvinfer1::DataType::kFLOAT; +} nvinfer1::Dims SwishPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, @@ -85,17 +95,29 @@ int SwishPlugin::enqueue(int batch_size, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT { #endif - // input dims is CHW. const auto &input_dims = this->getInputDims(0); - const float *input = reinterpret_cast(inputs[0]); - float *output = reinterpret_cast(outputs)[0]; int num = batch_size; for (int i = 0; i < input_dims.nbDims; i++) { num *= input_dims.d[i]; } int threads = 1024; int blocks = (num + threads - 1) / threads; - swish_kernel<<>>(num, input, output, beta_); + auto type = getDataType(); + if (type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Swish-->fp32"; + const float *input = reinterpret_cast(inputs[0]); + float *output = reinterpret_cast(outputs)[0]; + swish_kernel<<>>(num, input, output, beta_); + } else if (type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. 
Swish-->fp16"; + const half *input = reinterpret_cast(inputs[0]); + half *output = reinterpret_cast(outputs)[0]; + swish_kernel<<>>(num, input, output, + (half)beta_); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Swish TRT Plugin's input type should be float or half.")); + } return cudaGetLastError() != cudaSuccess; } @@ -140,12 +162,15 @@ bool SwishPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc &in = in_out[pos]; if (pos == 0) { if (with_fp16_) { - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && - (in.format == nvinfer1::TensorFormat::kLINEAR); + bool res = (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF); +// encounter trt crash bug +#if IS_TRT_VERSION_LT(8000) + res = res && (in.format == nvinfer1::TensorFormat::kLINEAR); +#endif + return res; } else { - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); + return in.type == nvinfer1::DataType::kFLOAT; } } const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index c4bdc5f921509..aa8fdce23fa89 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -26,7 +26,7 @@ namespace inference { namespace tensorrt { namespace plugin { -class SwishPlugin : public PluginTensorRT { +class SwishPlugin : public PluginTensorRTV2Ext { private: float beta_; @@ -55,13 +55,24 @@ class SwishPlugin : public PluginTensorRT { int initialize() TRT_NOEXCEPT override; - SwishPlugin* clone() const TRT_NOEXCEPT override { - return new SwishPlugin(beta_, with_fp16_); + nvinfer1::IPluginV2Ext* clone() const TRT_NOEXCEPT override { + auto* plugin = new SwishPlugin(beta_, with_fp16_); + plugin->data_format_ = data_format_; + plugin->data_type_ = data_type_; + plugin->input_dims_ = input_dims_; + return plugin; } const char* getPluginType() const TRT_NOEXCEPT override { return "swish_plugin"; } + + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override { + return input_types[0]; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; @@ -71,6 +82,12 @@ class SwishPlugin : public PluginTensorRT { int enqueue(int batchSize, const void* const* inputs, void* const* outputs, #endif void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + + void terminate() TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override { delete this; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "2"; } + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; }; class SwishPluginCreator : public TensorRTPluginCreator { @@ -79,7 +96,7 @@ class SwishPluginCreator : public TensorRTPluginCreator { return "swish_plugin"; } - const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "2"; } nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serial_data, diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.cu new file mode 100644 index 
0000000000000..755bb5aa28572 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.cu @@ -0,0 +1,91 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +inline __device__ float SigmoidGPU(const float& x) { + return 1.0f / (1.0f + __expf(-x)); +} + +__global__ void YoloBoxHeadKernel(const float* input, float* output, + const int grid_size_x, const int grid_size_y, + const int class_num, const int anchors_num) { + int x_id = blockIdx.x * blockDim.x + threadIdx.x; + int y_id = blockIdx.y * blockDim.y + threadIdx.y; + int z_id = blockIdx.z * blockDim.z + threadIdx.z; + if ((x_id >= grid_size_x) || (y_id >= grid_size_y) || (z_id >= anchors_num)) { + return; + } + const int grids_num = grid_size_x * grid_size_y; + const int bbindex = y_id * grid_size_x + x_id; + + // objectness + output[bbindex + grids_num * (z_id * (5 + class_num) + 4)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]); + // x + output[bbindex + grids_num * (z_id * (5 + class_num) + 0)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 0)]); + // y + output[bbindex + grids_num * (z_id * (5 + class_num) + 1)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 1)]); + // w + output[bbindex + grids_num * (z_id * (5 + class_num) + 2)] = + __expf(input[bbindex + grids_num * (z_id * (5 + class_num) + 2)]); + // h + output[bbindex + grids_num * (z_id * (5 + class_num) + 3)] = + __expf(input[bbindex + grids_num * (z_id * (5 + class_num) + 3)]); + // Probabilities of classes + for (int i = 0; i < class_num; ++i) { + output[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))] = + SigmoidGPU( + input[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))]); + } +} + +int YoloBoxHeadPlugin::enqueue(int batch_size, const void* const* inputs, +#if IS_TRT_VERSION_LT(8000) + void** outputs, +#else + void* const* outputs, +#endif + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + const int h = input_dims_[0].d[1]; + const int w = input_dims_[0].d[2]; + const int grid_size_x = w; + const int grid_size_y = h; + const int anchors_num = anchors_.size() / 2; + const float* input_data = static_cast(inputs[0]); + float* output_data = static_cast(outputs[0]); + const int volume = input_dims_[0].d[0] * h * w; + dim3 block(16, 16, 4); + dim3 grid((grid_size_x / block.x) + 1, (grid_size_y / block.y) + 1, + (anchors_num / block.z) + 1); + for (int n = 0; n < batch_size; n++) { + YoloBoxHeadKernel<<>>( + input_data + n * volume, output_data + n * volume, grid_size_x, + grid_size_y, class_num_, anchors_num); + } + return 0; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h 
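YoloBoxHeadKernel works on the raw YOLO head layout: for each anchor z there are 5 + class_num channels (x, y, w, h, objectness, then the class scores), and every channel is a full grid_size_y by grid_size_x plane. A host-side restatement of the addressing with one worked example; the concrete grid, anchor and class counts are illustrative.

// Channel-major addressing used by the kernel above.
inline int YoloHeadIndex(int x, int y, int z, int k, int grid_w, int grid_h,
                         int class_num) {
  const int grids_num = grid_w * grid_h;   // one plane per channel
  const int bbindex = y * grid_w + x;      // cell offset inside a plane
  return bbindex + grids_num * (z * (5 + class_num) + k);
}

// Example: 19x19 grid, class_num = 80, anchor z = 1, objectness channel k = 4,
// cell (x = 3, y = 7): bbindex = 7 * 19 + 3 = 136, grids_num = 361,
// index = 136 + 361 * (1 * 85 + 4) = 32265.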
b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h new file mode 100644 index 0000000000000..2094dbfc9db4b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h @@ -0,0 +1,104 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class YoloBoxHeadPlugin : public PluginTensorRT { + public: + explicit YoloBoxHeadPlugin(const std::vector& anchors, + const int class_num) + : anchors_(anchors), class_num_(class_num) {} + + YoloBoxHeadPlugin(const void* data, size_t length) { + deserializeBase(data, length); + DeserializeValue(&data, &length, &anchors_); + DeserializeValue(&data, &length, &class_num_); + } + + ~YoloBoxHeadPlugin() override{}; + + nvinfer1::IPluginV2* clone() const TRT_NOEXCEPT override { + return new YoloBoxHeadPlugin(anchors_, class_num_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "yolo_box_head_plugin"; + } + + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + + int initialize() TRT_NOEXCEPT override { return 0; } + + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) TRT_NOEXCEPT override { + assert(index == 0); + assert(nb_input_dims == 1); + return inputs[0]; + } + + int enqueue(int batch_size, const void* const* inputs, +#if IS_TRT_VERSION_LT(8000) + void** outputs, +#else + void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + + size_t getSerializationSize() const TRT_NOEXCEPT override { + return getBaseSerializationSize() + SerializedSize(anchors_) + + SerializedSize(class_num_); + } + + void serialize(void* buffer) const TRT_NOEXCEPT override { + serializeBase(buffer); + SerializeValue(&buffer, anchors_); + SerializeValue(&buffer, class_num_); + } + + private: + std::vector anchors_; + int class_num_; + std::string namespace_; +}; + +class YoloBoxHeadPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "yolo_box_head_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new YoloBoxHeadPlugin(serial_data, serial_length); + } +}; + +REGISTER_TRT_PLUGIN_V2(YoloBoxHeadPluginCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc index cca8ac2634c6c..141e60513eb95 100644 --- 
a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc @@ -147,10 +147,10 @@ void SetInput(std::vector> *inputs, file.read(reinterpret_cast(&total_words_num), sizeof(int64_t)); LOG(INFO) << "Total words in file: " << total_words_num; size_t lods_beginning_offset = static_cast(file.tellg()); - auto words_begining_offset = + auto words_beginning_offset = lods_beginning_offset + sizeof(size_t) * total_sentences_num; auto targets_beginning_offset = - words_begining_offset + sizeof(int64_t) * total_words_num; + words_beginning_offset + sizeof(int64_t) * total_words_num; std::vector lod_full = ReadSentenceLod(file, lods_beginning_offset, total_sentences_num); @@ -158,7 +158,7 @@ void SetInput(std::vector> *inputs, size_t lods_sum = std::accumulate(lod_full.begin(), lod_full.end(), 0UL); EXPECT_EQ(lods_sum, static_cast(total_words_num)); - TensorReader words_reader(file, words_begining_offset, "words"); + TensorReader words_reader(file, words_beginning_offset, "words"); TensorReader targets_reader(file, targets_beginning_offset, "targets"); // If FLAGS_iterations is set to 0, run all batches diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index 0906567dbf6c1..b4850a8e9e919 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -80,8 +80,8 @@ class Stat : public StatBase { while (prev_value < current_value && !peak_value_.compare_exchange_weak(prev_value, current_value)) { } - VLOG(8) << "Update peak_value, after update, peak_value = " << peak_value_ - << " , current value = " << current_value; + VLOG(8) << "Update peak_value, after update, peak_value = " + << peak_value_.load() << " , current value = " << current_value; } } diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 5448ed2a4bdad..8214b733f86da 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -634,7 +634,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need // DOut(dy) as input(not output), tensor extraction is different from -// others. Impliment extraction kernel seperately here. +// others. Impliment extraction kernel separately here. inline void ExtractDoubleGradTensorWithInputDOut( const framework::ExecutionContext& ctx, const framework::Tensor** X, const framework::Tensor** ddX, framework::Tensor** dX, diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index efaea94f26e8d..e311d21bb54d3 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -136,7 +136,7 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra(); AddAttr("align_corners", "(bool, default false) Whether to align the corners of input" - "and ouput.") + "and output.") .SetDefault(true); AddAttr>( "output_shape", diff --git a/paddle/fluid/operators/assign_op_mlu.cc b/paddle/fluid/operators/assign_op_mlu.cc new file mode 100644 index 0000000000000..85092c516955d --- /dev/null +++ b/paddle/fluid/operators/assign_op_mlu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
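The stats.h change only touches the log line, presumably so the atomic peak counter is read with an explicit load() before being streamed; the surrounding loop is the usual compare-exchange "update the maximum" idiom, restated here as a self-contained sketch.

#include <atomic>
#include <cstdint>

// Lock-free "peak = max(peak, current)": retry until either the stored peak is
// already >= current or our value has been published.
void UpdatePeak(std::atomic<int64_t>& peak, int64_t current) {
  int64_t prev = peak.load();
  while (prev < current && !peak.compare_exchange_weak(prev, current)) {
    // on failure compare_exchange_weak refreshes `prev`, so the condition is
    // re-evaluated against the latest stored peak
  }
}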
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { +template +class AssignMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Assign(ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), + GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(assign, ops::AssignMLUKernel, + ops::AssignMLUKernel, + ops::AssignMLUKernel, + ops::AssignMLUKernel) diff --git a/paddle/fluid/operators/assign_value_op_mlu.cc b/paddle/fluid/operators/assign_value_op_mlu.cc new file mode 100644 index 0000000000000..651e129ccb17a --- /dev/null +++ b/paddle/fluid/operators/assign_value_op_mlu.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/assign_value_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel, + ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 36a0d53e05245..2663a08101157 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -64,7 +64,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { (x_dims[i] == -1) || (x_dims[i] > 0), true, platform::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " - "positive number, but recieved %d. Input's shape is [%s].", + "positive number, but received %d. 
Input's shape is [%s].", x_dims[i], x_dims)); } diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index bc6cf9d831ff0..76e0f23df2168 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -156,7 +156,7 @@ REGISTER_OP_CPU_KERNEL( ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, - ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel>, diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index 5d006a947be19..0a9b66bc92c15 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -136,7 +136,7 @@ class CinnLaunchOpMaker : public framework::OpProtoAndCheckerMaker { "(vector)" "which are the output of graph inside the CinnLaunchOp.") .AsDuplicable(); - AddAttr( + AddAttr( kCompilationKey, "(string)" "a hash key used to get the graph object or its computation result."); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 024bf2bceb3d0..f40b788dfb5b3 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -27,6 +28,7 @@ #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/profiler.h" DECLARE_bool(enable_pe_launch_cinn); namespace paddle { @@ -60,13 +62,14 @@ class CinnLaunchOpKernel : public framework::OpKernel { const auto& scope = ctx.scope(); const auto& place = ctx.GetPlace(); void* stream = details::GetStream(ctx); + platform::RecordEvent record_event_1( + "Step 1. Find graph object and prepare input"); // Step 1. Find graph object and prepare input PADDLE_ENFORCE_EQ(ctx.HasAttr(kCompilationKey), true, platform::errors::NotFound( "No Attribute(%s) found for CinnLaunchOp operator.", kCompilationKey)); - const auto& compilation_key = - ctx.template Attr(kCompilationKey); + const auto& compilation_key = ctx.template Attr(kCompilationKey); VLOG(4) << "CinnLaunchOp attribute(" << kCompilationKey << ") " << "value:\n" << CinnCompiler::GetInstance()->ReadableKey(compilation_key); @@ -99,24 +102,44 @@ class CinnLaunchOpKernel : public framework::OpKernel { input_no_need_buffer_tensors); } + platform::RecordEvent record_event_2( + "Step 2. Get compilation result of the graph"); // Step 2. Get compilation result of the graph auto target = details::PlaceToCinnTarget(place); + using ClockType = std::chrono::steady_clock; + std::chrono::time_point start_t, end_t; + if (VLOG_IS_ON(1)) { + VLOG(1) << "Starts to compile at thread " << std::this_thread::get_id(); + start_t = ClockType::now(); + } const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile( compilation_key, inputs_name2tensor, target, stream); + if (VLOG_IS_ON(1)) { + end_t = ClockType::now(); + auto time_sec = std::chrono::duration_cast( + end_t - start_t); + VLOG(1) << "Ends to compile at thread " << std::this_thread::get_id() + << " , time cost : " << time_sec.count() << " ms"; + } details::DebugCinnCompiledResult(cinn_compiled_object); auto* launch_context = cinn_compiled_object.launch_context.get(); + platform::RecordEvent record_event_3("Step 3. Set CINN runtime FLAGS."); // Step 3. 
Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); // Step 4. Execute the compiled CINN instructions by a PE or // by the CINN compiled program in sequential order if (FLAGS_enable_pe_launch_cinn) { + platform::RecordEvent record_event_4( + "Step 4. Execute the runtime graph by PE."); VLOG(4) << "Execute the runtime graph by PE"; framework::Scope& exec_scope = scope.NewScope(); auto* pe = launch_context->InitializePE(place, &exec_scope); pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); } else { + platform::RecordEvent record_event_4( + "Step 4. Execute the compiled executable program."); VLOG(4) << "Execute the compiled executable program"; launch_context->UpdateCapturedEnv(scope, place); LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 4bed282ace8d1..eeae16a0d71f3 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -77,7 +77,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclBcast(out->mutable_data(place), numel, dtype, root, comm->comm(), stream)); - VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received " << phi::product(out->dims()); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index d1e269fb5a4fe..8f07480aaab14 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -62,7 +62,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(out->mutable_data(place), numel, dtype, root, comm->comm(), stream)); - VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received " << phi::product(out->dims()); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc index 31961d8a246a9..a065e49ff72be 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -59,7 +59,7 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel { PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); - VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. 
received " << phi::product(out->dims()); dev_ctx->Wait(); diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 96b27a833fba3..7a2a802382f6c 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -122,4 +122,5 @@ REGISTER_OP_CUDA_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, + ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index add352306fa28..57a3fe2e45d7e 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -109,4 +109,5 @@ REGISTER_OP_CUDA_KERNEL(send_v2, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, + ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc index 2d7382f3dfd70..882630467a012 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -41,7 +41,6 @@ class CSendOpASCENDKernel : public framework::OpKernel { // Use ProcessGroup distributed::ProcessGroup* pg = map->get(ring_id); std::vector in_tensor; - auto x = ctx.Input("X"); in_tensor.push_back(*x); auto task = pg->Send(in_tensor, 1); return; diff --git a/paddle/fluid/operators/compat/fill_constant.pbtxt b/paddle/fluid/operators/compat/fill_constant.pbtxt index 26fecf623c19c..62701eeb396da 100644 --- a/paddle/fluid/operators/compat/fill_constant.pbtxt +++ b/paddle/fluid/operators/compat/fill_constant.pbtxt @@ -58,4 +58,8 @@ extra { name: "op_device" type: STRING } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } } diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index a5d888765bf37..58f2eeee256db 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -43,7 +43,7 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation, output_size, 0, platform::errors::InvalidArgument( "The output's size is expected to be greater than 0. " - "But recieved: output's size is %d. The output's size is computed by " + "But received: output's size is %d. The output's size is computed by " "((input_size + 2 * padding - (dilation * (filter_size - 1) + 1)) / " "stride + 1), where input_size is %d, padding is %d, " "filter_size is %d, dilation is %d, stride is %d.", @@ -60,7 +60,7 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation, output_size, 0, platform::errors::InvalidArgument( "The output's size is expected to be greater than 0. " - "But recieved: output's size is %d. The output's size is computed by " + "But received: output's size is %d. The output's size is computed by " "((input_size + padding_1 + padding_2 - (dilation * (filter_size - " "1) + 1)) / stride + 1), where input_size is %d, padding is " "(%d, %d), filter_size is %d, dilation is %d, stride is %d.", @@ -90,7 +90,7 @@ inline void UpdatePaddingAndDilation(std::vector* paddings, platform::errors::InvalidArgument( "Attribute padding's size should be the same or twice as the " "input's dimension. 
" - "But recieved: padding's size is %d, padding is [%s]; input's " + "But received: padding's size is %d, padding is [%s]; input's " "dimension is %d, input's shape is [%s].", paddings->size(), phi::make_ddim(*paddings), data_dims.size(), data_dims)); diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc index 1ee772ec72950..c1517dbe16f84 100644 --- a/paddle/fluid/operators/conv_op_mlu.cc +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -98,7 +98,7 @@ class MLUConvOpKernel : public framework::OpKernel { output_desc.get(), GetBasePtr(&output_tensor)); if (!channel_last) { - // transpose ouput from NHWC to NCHW + // transpose output from NHWC to NCHW const std::vector perm_to_nchw = {0, 3, 1, 2}; TransposeFromMLUTensor(ctx, perm_to_nchw, &output_tensor, output, false /*need_reshape_or_alloc*/); diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index b1f2e61ef3930..ba90c677570c5 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -26,19 +26,19 @@ __global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, const size_t num_seq, size_t* lod0, const int blank, const int merge_repeated, size_t* out_lod0, T* output) { - int ouput_idx = 0; + int output_idx = 0; out_lod0[0] = 0; for (int i = 0; i < num_seq; ++i) { T pre_token = -1; for (int j = lod0[i]; j < lod0[i + 1]; ++j) { if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) { - output[ouput_idx] = tokens[j]; - ++ouput_idx; + output[output_idx] = tokens[j]; + ++output_idx; } pre_token = tokens[j]; } - out_lod0[i + 1] = ouput_idx; + out_lod0[i + 1] = output_idx; } } diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 448f67a4bad7a..873950b2d2f65 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -200,7 +200,7 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { num_rois, out->dims()[0], platform::errors::InvalidArgument( "The number of Input(ROIs) should be same with the number of " - "Ouput(Output), but received ROIs number is:%d, Output number " + "Output(Output), but received ROIs number is:%d, Output number " "is:%d.", num_rois, out->dims()[0])); const int count = num_rois * output_dim * pooled_height * pooled_width; diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index 51a0fe4172ca2..3deabce54ed0b 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -175,7 +175,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel { num_rois, out->dims()[0], platform::errors::InvalidArgument( "The number of Input(ROIs) should be same with the number of " - "Ouput(Output), but received ROIs number is:%d, Output number " + "Output(Output), but received ROIs number is:%d, Output number " "is:%d.", num_rois, out->dims()[0])); framework::Tensor roi_batch_id_list; diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 713c2dc7fe9c1..3353739b01bf6 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -385,7 +385,7 @@ independently for each class. 
The outputs is a 2-D LoDTenosr, for each image, the offsets in first dimension of LoDTensor are called LoD, the number of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, means there is no detected bbox for this image. Now this operator has one more -ouput, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of +output, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of detected bbox for this image. For more information on Matrix NMS, please refer to: diff --git a/paddle/fluid/operators/dropout_op_mlu.cc b/paddle/fluid/operators/dropout_op_mlu.cc index b88974a51ceff..f4dbbae05532e 100644 --- a/paddle/fluid/operators/dropout_op_mlu.cc +++ b/paddle/fluid/operators/dropout_op_mlu.cc @@ -82,7 +82,7 @@ class DropoutMLUKernel : public framework::OpKernel { *x, ctx.GetPlace(), ctx.template device_context(), out); } else { - float scale = static_cast(1.0f - dropout_prob); + auto scale = static_cast(1.0f - dropout_prob); Tensor scale_tensor(x->dtype()); scale_tensor.mutable_data({1}, ctx.GetPlace()); MLUCnnlTensorDesc scale_desc(scale_tensor); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 07b3b53811625..104ab1b504640 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -54,7 +54,7 @@ class DropoutNPUKernel : public framework::OpKernel { return; } - // only achive the default `upscale_in_train` method + // only achieve the default `upscale_in_train` method if (!is_test) { Tensor tmp_x(x->dtype()); Tensor tmp_out(out->dtype()); diff --git a/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc new file mode 100644 index 0000000000000..e003a43b5c56b --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseHeavisideOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Heaviside"; } + std::string GetEquation() const override { return "Out = Heaviside(X, Y)"; } + + void AddInputX() override { + AddInput("X", + "(Tensor), The input tensor of Heaviside step function. " + "Its dtype can be int32, int64, float32 and float64"); + } + + void AddInputY() override { + AddInput("Y", + "(Tensor), The tensor determining a Heaviside step function, " + "which is the value when X = 0. 
Its dtype should be same as X."); + } + + std::string GetOpFuntionality() const override { + return "Computes the Heaviside step function determined by Y " + "for each element in X."; + } +}; + +template +class ElementwiseHeavisideGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("elementwise_heaviside_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Y", this->Input("Y")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + elementwise_heaviside, ops::ElementwiseOp, ops::ElementwiseHeavisideOpMaker, + ops::ElementwiseHeavisideGradOpMaker, + ops::ElementwiseHeavisideGradOpMaker); + +REGISTER_OPERATOR(elementwise_heaviside_grad, ops::ElementwiseOpGrad); diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index 156cea81c0f63..ff1e12103be91 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -165,7 +165,7 @@ template void MLUUnary(const framework::ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t ouput_desc, void* output); + const cnnlTensorDescriptor_t output_desc, void* output); template <> inline void MLUUnary(const framework::ExecutionContext& ctx, diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 33518953004ae..6e646f0d4bf26 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -75,7 +75,7 @@ class FCOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The attribute in_num_col_dims used to flatten Input to " "a 2-D tensor, is expected to be less than the number of " - "Input's dimensions. But recieved in_num_col_dims is %d, " + "Input's dimensions. But received in_num_col_dims is %d, " "the number of Input's dimensions is %d, Input's shape is %s.", in_num_col_dims, in_dims.size(), in_dims)); @@ -93,7 +93,7 @@ class FCOp : public framework::OperatorWithKernel { in_dims.size() >= 2 && in_dims.size() <= 4, true, platform::errors::Unimplemented( "The Input of fc is expected to be a 2-D, 3-D or 4-D tensor when " - "use_mkldnn is set. But recieved the number of Input's " + "use_mkldnn is set. But received the number of Input's " "dimensions is %d, Input's shape is %s.", in_dims.size(), in_dims)); } diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 6d3b531ce0aa6..47c7128603587 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -36,7 +36,7 @@ inline void FCOutputSize(const framework::DDim& in_dims, in_mat_dims[1], w_dims0, platform::errors::InvalidArgument( "The input's second dimension and weight's first dimension is " - "expected to be the same. But recieved input's second dimension is " + "expected to be the same. 
But received input's second dimension is " "%d, input's shape is %s; weight's first dimension is %d, weight's " "shape is %s.", in_mat_dims[1], in_mat_dims, w_dims0, diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index ca46a3db1ecd5..07593a70f05b7 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -22,17 +22,17 @@ class FillConstantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FillConstant"); - auto& shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { for (size_t i = 0; i < shape.size(); ++i) { PADDLE_ENFORCE_GE( shape[i], 0, platform::errors::InvalidArgument( "Each value of attribute 'shape' is expected to be no less " - "than 0. But recieved: shape[%u] = %d; shape = [%s].", + "than 0. But received: shape[%u] = %d; shape = [%s].", i, shape[i], phi::make_ddim(shape))); } } @@ -52,8 +52,8 @@ class FillConstantOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") { return expected_kernel_type; } else { @@ -63,7 +63,7 @@ class FillConstantOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = framework::OpKernelType( framework::proto::VarType::Type(ctx.Attr("dtype")), ctx.GetPlace()); @@ -97,13 +97,24 @@ class FillConstantOp : public framework::OperatorWithKernel { } } +#ifdef PADDLE_WITH_MKLDNN + auto input_data_type = + framework::proto::VarType::Type(ctx.Attr("dtype")); + + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return kt; } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext* ctx) const override { + void operator()(framework::InferVarTypeContext *ctx) const override { auto data_type = static_cast( BOOST_GET_CONST(int, ctx->GetAttr("dtype"))); ctx->SetOutputDataType("Out", data_type); @@ -156,6 +167,10 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { "3: XPUPlace. " "4: NPUPlace. 
") .SetDefault(-1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); AddOutput("Out", "(Tensor) Tensor of specified shape will be filled " "with the specified value"); diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 92f59e118c3b7..9c9183c8fafa4 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -76,47 +76,47 @@ class FoldOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(kernel_height, 0, platform::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); PADDLE_ENFORCE_GT(kernel_width, 0, platform::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); // check strides PADDLE_ENFORCE_GT(stride_height, 0, platform::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); PADDLE_ENFORCE_GT(stride_width, 0, platform::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations PADDLE_ENFORCE_GT(output_height, 1, platform::errors::InvalidArgument( "The `output_height` should be greater than one, " - "but recieved output_height: %d .", + "but received output_height: %d .", output_height)); PADDLE_ENFORCE_GT(output_width, 1, platform::errors::InvalidArgument( "The `output_width` should be greater than one, " - "but recieved output_width: %d .", + "but received output_width: %d .", output_width)); // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); PADDLE_ENFORCE_GT( dilation_width, 0, platform::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); std::vector out_dims; diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 68b9051d85831..03351dbca09e5 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -11,6 +11,8 @@ register_operators(EXCLUDES fused_fc_elementwise_layernorm_op multihead_matmul_op skip_layernorm_op + yolo_box_head_op + yolo_box_post_op fused_embedding_eltwise_layernorm_op fusion_group_op fusion_gru_op @@ -53,6 +55,8 @@ if (WITH_GPU OR WITH_ROCM) # multihead_matmul_op op_library(multihead_matmul_op) op_library(skip_layernorm_op) + op_library(yolo_box_head_op) + op_library(yolo_box_post_op) op_library(fused_embedding_eltwise_layernorm_op) # fusion_group if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 3a2de0c4a0935..b059223eaf6e7 100644 --- 
a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -51,8 +51,7 @@ template use_broadcast, uint32_t numel, - phi::Array, MAX_INPUT_NUM> - configlists, + phi::Array configlists, int main_tid, int tail_tid, Functor func) { int fix = blockIdx.x * blockDim.x * VecSize; int num = tail_tid; @@ -65,14 +64,14 @@ __global__ void BroadcastKernelBinary( // load in0 if (use_broadcast[0]) { - kernel_primitives::ReadDataBc( + kernel_primitives::ReadDataBc( arg0, in0, fix, configlists[0], numel); } else { kernel_primitives::ReadData(arg0, in0 + fix, num); } // load in1 if (use_broadcast[1]) { - kernel_primitives::ReadDataBc( + kernel_primitives::ReadDataBc( arg1, in1, fix, configlists[1], numel); } else { kernel_primitives::ReadData(arg1, in1 + fix, num); @@ -104,7 +103,7 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, int main_tid = numel / (data_per_thread * vec_size * threads); int tail_tid = numel % (data_per_thread * vec_size * threads); - phi::Array, MAX_INPUT_NUM> configlists; + phi::Array configlists; phi::Array use_broadcast; use_broadcast[0] = false; @@ -115,7 +114,7 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, // Here, dims are transposed due to the logic in BroadcastConfig. std::vector input1_dims = {n, 1}; std::vector out_dims = {n, m}; - configlists[1] = kps::details::BroadcastConfig<2>(out_dims, input1_dims, 2); + configlists[1] = kps::details::BroadcastConfig(out_dims, input1_dims, 2); auto func = AddFunctor(); auto stream = ctx.stream(); diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index e60fc44e9a6ff..671e94061cb5c 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -80,7 +80,7 @@ class Conv2DFusionOp : public operators::ConvOp { data_format, "NHWC", platform::errors::PermissionDenied( "Operator(Conv2DFusion) only supports data format of " - "channel first (NCHW) now. But recieved: data_format = '%s'.", + "channel first (NCHW) now. But received: data_format = '%s'.", data_format)); std::vector output_shape = ComputeOutputShape(ctx); @@ -113,7 +113,7 @@ class Conv2DFusionOp : public operators::ConvOp { split_channels_sum, output_shape[1], platform::errors::InvalidArgument( "The sum of Attr(split_channels) is expected to be equal to the " - "total output channels. But recieved: the sum of " + "total output channels. But received: the sum of " "Attr(split_channels) = %d, the total output channels = %d.", split_channels_sum, output_shape[1])); diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 5dbf4fb88b2a7..8191c85f2a120 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -130,7 +130,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { default: PADDLE_THROW(platform::errors::PermissionDenied( "Operator Conv2DFusion expects Input to be a 4-D or 5-D Tensor. " - "But recieved the actual dimension = %d, shape = [%s].", + "But received the actual dimension = %d, shape = [%s].", rank, transformed_input_channel.dims())); } @@ -355,7 +355,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_in_bytes, workspace_size_limit, platform::errors::InvalidArgument( "The actual workspace size to be allocated for cuDNN is expected " - "to be less than the limit. 
But recieved: the actual workspace " + "to be less than the limit. But received: the actual workspace " "size = %d, limit = %d.", workspace_size_in_bytes, workspace_size_limit)); @@ -414,7 +414,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } else { // TODO(qingiqng): do copy when batch size large than 1 PADDLE_THROW(platform::errors::Unimplemented( - "Input with batch size greater than 1 is unsupported. The recieved " + "Input with batch size greater than 1 is unsupported. The received " "batch size is %d, Input's shape is [%s].", x_dims[0], phi::make_ddim(x_dims))); } diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 74cc92eb8ab62..4b3ed56890e18 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -40,7 +40,7 @@ struct BNStatsFinalizeArgs { PADDLE_ENFORCE_EQ( param_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of param_shape is expected to 4. But recieved " + "The size of param_shape is expected to 4. But received " "param_shape's size is %d, param_shape is [%s].", param_shape.size(), phi::make_ddim(param_shape))); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index f63fe4b96cbeb..b32f2e40933ac 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -45,13 +45,13 @@ struct NormConvolutionArgs { PADDLE_ENFORCE_EQ( input_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of input_shape is expected to 4. But recieved " + "The size of input_shape is expected to 4. But received " "input_shape's size is %d, input_shape is [%s].", input_shape.size(), phi::make_ddim(input_shape))); PADDLE_ENFORCE_EQ( filter_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of filter_shape is expected to 4. But recieved " + "The size of filter_shape is expected to 4. But received " "filter_shape's size is %d, filter_shape is [%s].", filter_shape.size(), phi::make_ddim(filter_shape))); PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] && @@ -59,20 +59,20 @@ struct NormConvolutionArgs { true, platform::errors::InvalidArgument( "The filter_shape is expected to store as nhwc, and " - "h = w = 1 or 3. But recieved filter_shape is [%s].", + "h = w = 1 or 3. But received filter_shape is [%s].", phi::make_ddim(filter_shape))); PADDLE_ENFORCE_EQ((filter_shape[0] % 32 == 0 && filter_shape[3] % 8 == 0), true, platform::errors::InvalidArgument( "The input channel is expected to be multiple of 8, " "and the output channel is expected to be multiple " - "of 32. But recieved input channel is %d, output " + "of 32. But received input channel is %d, output " "channel is %d.", filter_shape[3], filter_shape[0])); PADDLE_ENFORCE_EQ( output_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of output_shape is expected to 4. But recieved " + "The size of output_shape is expected to 4. But received " "filter_shape's size is %d, filter_shape is [%s].", output_shape.size(), phi::make_ddim(output_shape))); is_support = IsSupport(ctx, filter_shape, stride, dilation, group); @@ -83,7 +83,7 @@ struct NormConvolutionArgs { "compatiblity greater than or equal to 70 and the kernel size " "must be equal to 1 or 3. When the kernel size is 1, " "the stride must be 1 if the compatiblity is equal to 70. " - "Besides, the dilation and group must be equal to 1. 
But recieved " + "Besides, the dilation and group must be equal to 1. But received " "compatiblity is %d, kernel size is %d, stride is %d, " "dilation is %d, group is %d", ctx.GetComputeCapability(), filter_shape[1], stride, dilation, diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 9d3090a7179f0..c8588b0c02e9d 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -43,19 +43,19 @@ struct ScaleBiasAddReluArgs { PADDLE_ENFORCE_EQ( data_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of data_shape is expected to 4. But recieved " + "The size of data_shape is expected to 4. But received " "data_shape's size is %d, data_shape is [%s].", data_shape.size(), phi::make_ddim(data_shape))); PADDLE_ENFORCE_EQ( param_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of param_shape is expected to 4. But recieved " + "The size of param_shape is expected to 4. But received " "param_shape's size is %d, param_shape is [%s].", param_shape.size(), phi::make_ddim(param_shape))); PADDLE_ENFORCE_EQ( bitmask_shape.size(), 3U, platform::errors::InvalidArgument( - "The size of bitmask_shape is expected to 3. But recieved " + "The size of bitmask_shape is expected to 3. But received " "bitmask_shape's size is %d, bitmask_shape is [%s].", bitmask_shape.size(), phi::make_ddim(bitmask_shape))); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 6eb5881112f89..3d75d127ab60a 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dropout_impl.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" @@ -148,18 +149,24 @@ class FMHARef { stride_b); int softmax_axis = -1; if (src_mask_tensor != nullptr) { - std::vector ins; - std::vector outs; - ins.emplace_back(qk_out_tensor); - ins.emplace_back(src_mask_tensor); - outs.emplace_back(src_mask_out_tensor); - int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); + if (src_mask_out_tensor == nullptr && seq_len_ == out_seq_len) { + LaunchFusedSoftmaxMaskKernel(qk_out_data, src_mask_tensor->data(), + softmax_out_data, batch_size_, + num_head_, seq_len_, dev_ctx_.stream()); + } else { + std::vector ins; + std::vector outs; + ins.emplace_back(qk_out_tensor); + ins.emplace_back(src_mask_tensor); + outs.emplace_back(src_mask_out_tensor); + int elewise_add_axis = -1; + paddle::operators::LaunchElementwiseCudaKernel( + dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); - phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, - softmax_axis, softmax_out_tensor); + phi::SoftmaxForwardCUDAKernelDriver( + dev_ctx_, *src_mask_out_tensor, softmax_axis, softmax_out_tensor); + } } else { phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out_tensor, softmax_axis, softmax_out_tensor); diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc 
b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc index 27dae27751681..1b5b074ef1c71 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc @@ -76,7 +76,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The attribute x_num_col_dims used to flatten input X to " "a 2-D tensor, is expected to be less than the number of " - "input X's dimensions. But recieved x_num_col_dims is %d, " + "input X's dimensions. But received x_num_col_dims is %d, " "the number of input X's dimensions is %d, input X's shape is %s.", x_num_col_dims, x_dims.size(), x_dims)); @@ -85,7 +85,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { x_mat_dims[1], w_dims[0], platform::errors::InvalidArgument( "The input's second dimension and weight's first dimension is " - "expected to be the same. But recieved input's second dimension is " + "expected to be the same. But received input's second dimension is " "%d, input's shape is %s; weight's first dimension is %d, weight's " "shape is %s.", x_mat_dims[1], x_mat_dims, w_dims[0], w_dims)); @@ -100,7 +100,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(phi::make_ddim(fc_out_dims), y_dims, platform::errors::InvalidArgument( "The output's shape of fc is expected to be equal to " - "that of input Y. But recieved output's shape of fc " + "that of input Y. But received output's shape of fc " "is %s, input Y's shape is %s.", phi::make_ddim(fc_out_dims), y_dims)); @@ -110,7 +110,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The attribute begin_norm_axis used to flatten input Y to a 2-D " "tensor, is expected to be less than the number of input Y's " - "dimensions. But recieved begin_norm_axis is %d, the number of " + "dimensions. But received begin_norm_axis is %d, the number of " "input Y's dimensions is %d, input Y's shape is %s.", begin_norm_axis, y_dims.size(), y_dims)); @@ -122,7 +122,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(scale_dims.size(), 1, platform::errors::InvalidArgument( "The input Scale is expected to be an 1-D tensor. " - "But recieved the number of input Scale's " + "But received the number of input Scale's " "dimensions is %d, input Scale's shape is %s.", scale_dims.size(), scale_dims)); @@ -132,7 +132,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The first dimension of input Scale is expected to be equal to " "the second dimension of input Y after flattened. " - "But recieved the first dimension of input Scale is %d, input " + "But received the first dimension of input Scale is %d, input " "Scale's shape is %s; the second dimension of flattened input " "Y is %d, input Y's shape is %s, flattened axis is %d.", scale_dims[0], scale_dims, dim_1, y_dims, begin_norm_axis)); @@ -144,7 +144,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { bias1_dims.size(), 1, platform::errors::InvalidArgument( "The input Bias1 is expected to be an 1-D tensor. 
" - "But recieved the number of input Bias1's dimension is %d, " + "But received the number of input Bias1's dimension is %d, " "input Bias1's shape is %s.", bias1_dims.size(), bias1_dims)); @@ -154,7 +154,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The first dimension of input Bias1 is expected to be equal to " "the second dimension of input Y after flattened. " - "But recieved the first dimension of input Bias1 is %d, input " + "But received the first dimension of input Bias1 is %d, input " "Bias1's shape is %s; the second dimension of flatten input " "Y is %d, input Y's shape is %s, flattened axis is %d.", bias1_dims[0], bias1_dims, dim_1, y_dims, begin_norm_axis)); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index e38ac9a0ad2da..fdd0208c3d316 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -1084,11 +1084,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { auto *qk_out_data = qk_out.mutable_data({bsz, num_head, seq_len, out_seq_len}, place); - Tensor src_mask_out, softmax_out; + Tensor softmax_out; Tensor attn_dropout_mask_out, attn_dropout_out; Tensor qktv_out, fmha_out; - auto *src_mask_out_data = src_mask_out.mutable_data( - {bsz, num_head, seq_len, out_seq_len}, place); auto *softmax_out_data = softmax_out.mutable_data( {bsz, num_head, seq_len, out_seq_len}, place); @@ -1219,10 +1217,10 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { 1. / sqrt(dim_head)); } else if (cache_kv_out) { // generation context stage // TODO(wangxi): can remove dropout in inference - fmha_compute.ComputeForward( - qkv_out, nullptr, src_mask, &transpose_out_2, nullptr, &qk_out, - &src_mask_out, &softmax_out, &attn_dropout_mask_out, - &attn_dropout_out, &qktv_out, &fmha_out); + fmha_compute.ComputeForward(qkv_out, nullptr, src_mask, + &transpose_out_2, nullptr, &qk_out, nullptr, + &softmax_out, &attn_dropout_mask_out, + &attn_dropout_out, &qktv_out, &fmha_out); // [3, bsz, num_head, seq_len, head_dim] T *qkv_data = transpose_out_2_data; int64_t q_size = bsz * seq_len * num_head * dim_head; @@ -1245,7 +1243,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // TODO(wangxi): can remove dropout in inference fmha_compute.ComputeForward( qkv_out, cache_kv, src_mask, &transpose_out_2, cache_kv_out, - &qk_out, &src_mask_out, &softmax_out, &attn_dropout_mask_out, + &qk_out, nullptr, &softmax_out, &attn_dropout_mask_out, &attn_dropout_out, &qktv_out, &fmha_out); } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h new file mode 100644 index 0000000000000..11f1011dec3a2 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -0,0 +1,204 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +namespace plat = paddle::platform; + +#define FINAL_MASK 0xffffffff +#define DIV_UP(x, y) (((x) + (y)-1) / (y)) + +template +__inline__ __device__ T warpReduceSum(T val) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) + val += __shfl_xor_sync(FINAL_MASK, val, mask, 32); + return val; +} + +template +__inline__ __device__ T warpReduceMax(T val) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) + val = max(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32)); + return val; +} + +inline int ElementsCeil(int seq_len) { + int elements = 1; + while (elements * 32 < seq_len) elements *= 2; + return elements; +} + +template +__global__ void FusedSoftmaxMaskVecKernel(T* dst, const T* src, const T* mask, + int seq_len) { + constexpr int block_size = 128; + constexpr int warp_size = 32; + constexpr int warps_per_block = block_size / warp_size; + + // blockDim/threadIdx = (warp_size, warps_per_block) + // gridDim/blockIdx = (DIV_UP(seq_len, warps_per_block), batch_size, head_num) + // every block processes 4(warps_per_block) sequences + // seq_id = seq_id * 4 + warp_id, eg.seq_len=128, 127=31*4+3 + int seq_id = blockIdx.x * warps_per_block + threadIdx.y; + if (seq_id >= seq_len) return; + + // ((bid*head_num + hid)*seq_len + seq_id) * seq_len + int offset = + ((blockIdx.y * gridDim.z + blockIdx.z) * seq_len + seq_id) * seq_len; + // (bid * seq_len + seq_id) * seq_len + int mask_offset = (blockIdx.y * seq_len + seq_id) * seq_len; + src += offset; + dst += offset; + mask += mask_offset; + + static_assert(ELEMENTS_PER_THREADS % VEC_SIZE == 0, ""); + constexpr int VEC_NUMS = ELEMENTS_PER_THREADS / VEC_SIZE; + using VecT = phi::AlignedVector; + + VecT elements[VEC_NUMS]; + VecT tmp_mask; + float max_val = -std::numeric_limits::infinity(); + + for (int i = 0; (i * warp_size + threadIdx.x) * VEC_SIZE < seq_len; ++i) { + phi::Load(src + (i * warp_size + threadIdx.x) * VEC_SIZE, &elements[i]); + phi::Load(mask + (i * warp_size + threadIdx.x) * VEC_SIZE, &tmp_mask); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + // TODO(wangxi): vec add + elements[i][j] += tmp_mask[j]; + max_val = max(max_val, static_cast(elements[i][j])); + } + } + max_val = warpReduceMax(max_val); + + float sum_val = 0; + for (int i = 0; (i * warp_size + threadIdx.x) * VEC_SIZE < seq_len; ++i) { +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + float tmp = __expf(static_cast(elements[i][j]) - max_val); + sum_val += tmp; + elements[i][j] = static_cast(tmp); + } + } + sum_val = warpReduceSum(sum_val); + float mean_val = __fdividef(1.0f, sum_val + 1e-6f); + + for (int i = 0; (i * warp_size + threadIdx.x) * VEC_SIZE < seq_len; ++i) { +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + float tmp = static_cast(elements[i][j]) * mean_val; + elements[i][j] = static_cast(tmp); + } + phi::Store(elements[i], dst + (i * warp_size + threadIdx.x) * VEC_SIZE); + } +} + +#define SOFTMAX_MASK_KERNEL(VEC_SIZE, ELEMENTS) \ + FusedSoftmaxMaskVecKernel<<>>( \ + dst, src, mask, seq_len) + +// FIXME(wangxi): It is found that the performance of VEC_SIZE=2 is better +// than that of =4 and =8. Further analysis of the kernel is needed later. 
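
As a reading aid for the kernel above, here is a plain CPU reference of what each warp computes for one row: add the mask, subtract the row maximum, exponentiate, and normalize with the same 1e-6 guard. The function name is illustrative; this is a sketch of the semantics, not the fused implementation.

#include <algorithm>
#include <cmath>
#include <vector>

// Per-row reference: softmax(src + mask) with the 1e-6 denominator guard
// used by FusedSoftmaxMaskVecKernel. Assumes src and mask have the same,
// non-zero length (one sequence row each).
std::vector<float> MaskedSoftmaxRowRef(const std::vector<float>& src,
                                       const std::vector<float>& mask) {
  std::vector<float> x(src.size());
  for (size_t i = 0; i < src.size(); ++i) x[i] = src[i] + mask[i];
  const float max_val = *std::max_element(x.begin(), x.end());
  float sum = 0.f;
  for (float& v : x) {
    v = std::exp(v - max_val);
    sum += v;
  }
  const float inv = 1.f / (sum + 1e-6f);
  for (float& v : x) v *= inv;
  return x;
}

In the launcher defined below, ElementsCeil picks the smallest power of two such that 32 threads times that many elements cover seq_len, and SELECT_SOFTMAX_MASK_KERNEL then prefers VEC_SIZE = 2, per the FIXME above.
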
+// #define SELECT_SOFTMAX_MASK_KERNEL(ELEMENTS) \ +// do { \ +// if (sizeof(T) == 2 && seq_len % 8 == 0) { \ +// FusedSoftmaxMaskVecKernel \ +// <<>>( \ +// (plat::float16*)dst, (const plat::float16*)src, mask, seq_len); \ +// } \ +// else if (seq_len % 4 == 0) SOFTMAX_MASK_KERNEL(4, ELEMENTS); \ +// else if (seq_len % 2 == 0) SOFTMAX_MASK_KERNEL(2, ELEMENTS); \ +// else SOFTMAX_MASK_KERNEL(1, ELEMENTS); \ +// } while(0) + +#define SELECT_SOFTMAX_MASK_KERNEL(ELEMENTS) \ + do { \ + if (seq_len % 2 == 0) { \ + SOFTMAX_MASK_KERNEL(2, ELEMENTS); \ + } else { \ + SOFTMAX_MASK_KERNEL(1, ELEMENTS); \ + } \ + } while (0) + +#define CASE_SOFTMAX_MASK_KERNEL(ELEMENTS) \ + case ELEMENTS: { \ + SELECT_SOFTMAX_MASK_KERNEL(ELEMENTS); \ + break; \ + } + +// template +template +void LaunchFusedSoftmaxMaskKernel(const T* src, const T* mask, T* dst, + const int batch_size, const int head_num, + const int seq_len, cudaStream_t stream) { + PADDLE_ENFORCE_EQ( + seq_len > 0 && seq_len <= 4096, true, + platform::errors::InvalidArgument("seq_len must be between (0, 4096] " + "received the seq_len is %d", + seq_len)); + + constexpr int block_size = 128; + constexpr int warp_size = 32; + constexpr int warps_per_block = block_size / warp_size; + + // put head_num to the outside for mask + dim3 block(warp_size, warps_per_block); + dim3 grid(DIV_UP(seq_len, warps_per_block), batch_size, head_num); + + // clang-format off + int elements = ElementsCeil(seq_len); + switch (elements) { + case 1: { // <=32 + SOFTMAX_MASK_KERNEL(1, 1); + break; + } + case 2: { // <=64 + // if (seq_len % 2 == 0) SOFTMAX_MASK_KERNEL(2, 2); + // else SOFTMAX_MASK_KERNEL(1, 2); + SELECT_SOFTMAX_MASK_KERNEL(2); + break; + } + case 4: { // <=128 + // if (seq_len % 4 == 0) SOFTMAX_MASK_KERNEL(4, 4); + // else if (seq_len % 2 == 0) SOFTMAX_MASK_KERNEL(2, 4); + // else SOFTMAX_MASK_KERNEL(1, 4); + SELECT_SOFTMAX_MASK_KERNEL(4); + break; + } + CASE_SOFTMAX_MASK_KERNEL(8); // <=256 + CASE_SOFTMAX_MASK_KERNEL(16); // <=512 + CASE_SOFTMAX_MASK_KERNEL(32); // <=1024 + CASE_SOFTMAX_MASK_KERNEL(64); // <=2048 + CASE_SOFTMAX_MASK_KERNEL(128); // <=4096 + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "seq_len must be between (0, 4096], received the seq_len is %d", + seq_len)); + } + // clang-format on +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc index 738e069081511..1ebbdf792df85 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cc @@ -52,7 +52,7 @@ class FusionGroupOp : public framework::OperatorWithKernel { x_dims[0], x_dims[i], platform::errors::InvalidArgument( "All the inputs' dims is expected to be the same. " - "But recieved [%s] (name: %s) vs [%s] (name: %s).", + "But received [%s] (name: %s) vs [%s] (name: %s).", x_dims[0], input_names[0], x_dims[i], input_names[i])); } std::vector out_dims; diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cc b/paddle/fluid/operators/fused/yolo_box_head_op.cc new file mode 100644 index 0000000000000..58df4e61bbbdf --- /dev/null +++ b/paddle/fluid/operators/fused/yolo_box_head_op.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class YoloBoxHeadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "yolo_box_head"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "yolo_box_head"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } +}; + +class YoloBoxHeadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "The input tensor"); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair."); + AddAttr("class_num", "The number of classes to predict."); + AddOutput("Out", "The output tensor"); + AddComment(R"DOC( + yolo_box_head Operator. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolo_box_head, ops::YoloBoxHeadOp, ops::YoloBoxHeadOpMaker); diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cu b/paddle/fluid/operators/fused/yolo_box_head_op.cu new file mode 100644 index 0000000000000..4c79e22d1a536 --- /dev/null +++ b/paddle/fluid/operators/fused/yolo_box_head_op.cu @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
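
Shape note: YoloBoxHeadOp above only requires Out to share X's shape, while the indexing in the CUDA kernel that follows additionally implies an [N, anchors_num * (5 + class_num), H, W] channel layout. A tiny illustrative check of that assumption (not part of the patch):

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical helper: checks the channel count implied by the kernel's
// z_id * (5 + class_num) + k indexing, where z_id < anchors_num.
// The kernel only needs at least this many channels; typically it is exact.
void CheckYoloHeadInputShape(const std::vector<int64_t>& x_dims,
                             int anchors_num, int class_num) {
  assert(x_dims.size() == 4);
  assert(x_dims[1] >= static_cast<int64_t>(anchors_num) * (5 + class_num));
}
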
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +template +inline __device__ T SigmoidGPU(const T& x) { + return 1.0f / (1.0f + __expf(-x)); +} + +template +__global__ void YoloBoxHeadCudaKernel(const T* input, T* output, + const int grid_size_x, + const int grid_size_y, + const int class_num, + const int anchors_num) { + int x_id = blockIdx.x * blockDim.x + threadIdx.x; + int y_id = blockIdx.y * blockDim.y + threadIdx.y; + int z_id = blockIdx.z * blockDim.z + threadIdx.z; + if ((x_id >= grid_size_x) || (y_id >= grid_size_y) || (z_id >= anchors_num)) { + return; + } + const int grids_num = grid_size_x * grid_size_y; + const int bbindex = y_id * grid_size_x + x_id; + + // objectness + output[bbindex + grids_num * (z_id * (5 + class_num) + 4)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]); + // x + output[bbindex + grids_num * (z_id * (5 + class_num) + 0)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 0)]); + // y + output[bbindex + grids_num * (z_id * (5 + class_num) + 1)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 1)]); + // w + output[bbindex + grids_num * (z_id * (5 + class_num) + 2)] = + __expf(input[bbindex + grids_num * (z_id * (5 + class_num) + 2)]); + // h + output[bbindex + grids_num * (z_id * (5 + class_num) + 3)] = + __expf(input[bbindex + grids_num * (z_id * (5 + class_num) + 3)]); + // Probabilities of classes + for (int i = 0; i < class_num; ++i) { + output[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))] = + SigmoidGPU( + input[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))]); + } +} + +template +class YoloBoxHeadKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using Tensor = framework::Tensor; + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto anchors = context.Attr>("anchors"); + auto class_num = context.Attr("class_num"); + auto& device_ctx = + context.template device_context(); + auto x_dims = x->dims(); + const int batch_size = x_dims[0]; + const int h = x_dims[2]; + const int w = x_dims[3]; + const int grid_size_x = w; + const int grid_size_y = h; + const int anchors_num = anchors.size() / 2; + const T* input_data = x->data(); + T* output_data = out->mutable_data(context.GetPlace()); + auto stream = device_ctx.stream(); + const int volume = x_dims[1] * h * w; + dim3 block(16, 16, 4); + dim3 grid((grid_size_x / block.x) + 1, (grid_size_y / block.y) + 1, + (anchors_num / block.z) + 1); + for (int n = 0; n < batch_size; n++) { + YoloBoxHeadCudaKernel<<>>( + input_data + n * volume, output_data + n * volume, grid_size_x, + grid_size_y, class_num, anchors_num); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(yolo_box_head, ops::YoloBoxHeadKernel); diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cc b/paddle/fluid/operators/fused/yolo_box_post_op.cc new file mode 100644 index 0000000000000..674944173698b --- /dev/null +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class YoloBoxPostOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("Boxes0"), "Input", "Boxes0", "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasInput("Boxes1"), "Input", "Boxes1", "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasInput("Boxes2"), "Input", "Boxes2", "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasInput("ImageShape"), "Input", "ImageShape", + "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasInput("ImageScale"), "Input", "ImageScale", + "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasOutput("NmsRoisNum"), "Output", "NmsRoisNum", + "yolo_box_post"); + } +}; + +class YoloBoxPostOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("Boxes0", "The Boxes0 tensor"); + AddInput("Boxes1", "The Boxes1 tensor"); + AddInput("Boxes2", "The Boxes2 tensor"); + AddInput("ImageShape", "The height and width of each input image."); + AddInput("ImageScale", "The scale factor of ImageShape."); + AddAttr>("anchors0", "The anchors of Boxes0."); + AddAttr>("anchors1", "The anchors of Boxes1."); + AddAttr>("anchors2", "The anchors of Boxes2."); + AddAttr("class_num", "The number of classes to predict."); + AddAttr("conf_thresh", + "The confidence scores threshold of detection boxes. " + "Boxes with confidence scores under threshold should " + "be ignored."); + AddAttr("downsample_ratio0", "The downsample ratio of Boxes0."); + AddAttr("downsample_ratio1", "The downsample ratio of Boxes1."); + AddAttr("downsample_ratio2", "The downsample ratio of Boxes2."); + AddAttr("clip_bbox", + "Whether clip output bonding box in Input(ImgSize) " + "boundary. Default true."); + AddAttr("scale_x_y", + "Scale the center point of decoded bounding " + "box. Default 1.0"); + AddAttr("nms_threshold", "The threshold to be used in NMS."); + AddOutput("Out", "The output tensor"); + AddOutput("NmsRoisNum", "The output RoIs tensor"); + AddComment(R"DOC( + yolo_box_post Operator. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolo_box_post, ops::YoloBoxPostOp, ops::YoloBoxPostOpMaker); diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu new file mode 100644 index 0000000000000..4438a4c7dd812 --- /dev/null +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -0,0 +1,519 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
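
The CUDA file that follows suppresses overlapping detections of the same class with a plain IoU test built from Overlap / BoxIntersection / BoxUnion / BoxIOU. A compact host-side restatement of that computation (boxes stored as center x, center y, width, height; the Ref-suffixed names are illustrative):

#include <algorithm>

// Mirrors the IoU helpers used by PostNMS in yolo_box_post_op.cu below.
struct BoxRef { float x, y, w, h; };

float OverlapRef(float c1, float w1, float c2, float w2) {
  const float left = std::max(c1 - w1 / 2, c2 - w2 / 2);
  const float right = std::min(c1 + w1 / 2, c2 + w2 / 2);
  return right - left;  // may be negative when the intervals do not overlap
}

float BoxIOURef(const BoxRef& a, const BoxRef& b) {
  const float w = OverlapRef(a.x, a.w, b.x, b.w);
  const float h = OverlapRef(a.y, a.h, b.y, b.h);
  const float inter = (w < 0 || h < 0) ? 0.f : w * h;
  const float uni = a.w * a.h + b.w * b.h - inter;
  return inter / uni;  // PostNMS drops a box when this exceeds nms_threshold
}

PostNMS first sorts detections by their max-probability class (NMSComparator), so only same-class pairs are compared against the threshold.
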
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +struct Box { + float x, y, w, h; +}; + +struct Detection { + Box bbox; + int classes; + float* prob; + float* mask; + float objectness; + int sort_class; + int max_prob_class_index; +}; + +struct TensorInfo { + int bbox_count_host; // record bbox numbers + int bbox_count_max_alloc{50}; + float* bboxes_dev_ptr; + float* bboxes_host_ptr; + int* bbox_count_device_ptr; // Box counter in gpu memory, used by atomicAdd +}; + +static int NMSComparator(const void* pa, const void* pb) { + const Detection a = *reinterpret_cast(pa); + const Detection b = *reinterpret_cast(pb); + if (a.max_prob_class_index > b.max_prob_class_index) + return 1; + else if (a.max_prob_class_index < b.max_prob_class_index) + return -1; + + float diff = 0; + if (b.sort_class >= 0) { + diff = a.prob[b.sort_class] - b.prob[b.sort_class]; + } else { + diff = a.objectness - b.objectness; + } + + if (diff < 0) + return 1; + else if (diff > 0) + return -1; + return 0; +} + +static float Overlap(float x1, float w1, float x2, float w2) { + float l1 = x1 - w1 / 2; + float l2 = x2 - w2 / 2; + float left = l1 > l2 ? l1 : l2; + float r1 = x1 + w1 / 2; + float r2 = x2 + w2 / 2; + float right = r1 < r2 ? r1 : r2; + return right - left; +} + +static float BoxIntersection(Box a, Box b) { + float w = Overlap(a.x, a.w, b.x, b.w); + float h = Overlap(a.y, a.h, b.y, b.h); + if (w < 0 || h < 0) return 0; + float area = w * h; + return area; +} + +static float BoxUnion(Box a, Box b) { + float i = BoxIntersection(a, b); + float u = a.w * a.h + b.w * b.h - i; + return u; +} + +static float BoxIOU(Box a, Box b) { + return BoxIntersection(a, b) / BoxUnion(a, b); +} + +static void PostNMS(std::vector* det_bboxes, float thresh, + int classes) { + int total = det_bboxes->size(); + if (total <= 0) { + return; + } + + Detection* dets = det_bboxes->data(); + int i, j, k; + k = total - 1; + for (i = 0; i <= k; ++i) { + if (dets[i].objectness == 0) { + Detection swap = dets[i]; + dets[i] = dets[k]; + dets[k] = swap; + --k; + --i; + } + } + total = k + 1; + + qsort(dets, total, sizeof(Detection), NMSComparator); + + for (i = 0; i < total; ++i) { + if (dets[i].objectness == 0) continue; + Box a = dets[i].bbox; + for (j = i + 1; j < total; ++j) { + if (dets[j].objectness == 0) continue; + if (dets[j].max_prob_class_index != dets[i].max_prob_class_index) break; + Box b = dets[j].bbox; + if (BoxIOU(a, b) > thresh) { + dets[j].objectness = 0; + for (k = 0; k < classes; ++k) { + dets[j].prob[k] = 0; + } + } + } + } +} + +__global__ void YoloBoxNum(const float* input, int* bbox_count, + const int grid_size, const int class_num, + const int anchors_num, float prob_thresh) { + int x_id = blockIdx.x * blockDim.x + threadIdx.x; + int y_id = blockIdx.y * blockDim.y + threadIdx.y; + int z_id = blockIdx.z * blockDim.z + threadIdx.z; + if ((x_id >= grid_size) || (y_id >= grid_size) || (z_id >= anchors_num)) { + return; + } + + const int grids_num = grid_size * grid_size; + const int bbindex = y_id * grid_size + 
x_id; + float objectness = input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]; + if (objectness < prob_thresh) { + return; + } + + atomicAdd(bbox_count, 1); +} + +__global__ void YoloTensorParseKernel( + const float* input, const float* im_shape_data, const float* im_scale_data, + float* output, int* bbox_index, const int grid_size, const int class_num, + const int anchors_num, const int netw, const int neth, int* biases, + float prob_thresh) { + int x_id = blockIdx.x * blockDim.x + threadIdx.x; + int y_id = blockIdx.y * blockDim.y + threadIdx.y; + int z_id = blockIdx.z * blockDim.z + threadIdx.z; + if ((x_id >= grid_size) || (y_id >= grid_size) || (z_id >= anchors_num)) { + return; + } + + const float pic_h = im_shape_data[0] / im_scale_data[0]; + const float pic_w = im_shape_data[1] / im_scale_data[1]; + const int grids_num = grid_size * grid_size; + const int bbindex = y_id * grid_size + x_id; + float objectness = input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]; + if (objectness < prob_thresh) { + return; + } + + int cur_bbox_index = atomicAdd(bbox_index, 1); + int tensor_index = cur_bbox_index * (5 + class_num); + + // x + float x = input[bbindex + grids_num * (z_id * (5 + class_num) + 0)]; + x = (x + static_cast(x_id)) * static_cast(pic_w) / + static_cast(grid_size); + // y + float y = input[bbindex + grids_num * (z_id * (5 + class_num) + 1)]; + y = (y + static_cast(y_id)) * static_cast(pic_h) / + static_cast(grid_size); + // w + float w = input[bbindex + grids_num * (z_id * (5 + class_num) + 2)]; + w = w * biases[2 * z_id] * pic_w / netw; + // h + float h = input[bbindex + grids_num * (z_id * (5 + class_num) + 3)]; + h = h * biases[2 * z_id + 1] * pic_h / neth; + + output[tensor_index] = objectness; + output[tensor_index + 1] = x - w / 2; + output[tensor_index + 2] = y - h / 2; + output[tensor_index + 3] = x + w / 2; + output[tensor_index + 4] = y + h / 2; + output[tensor_index + 1] = + output[tensor_index + 1] > 0 ? output[tensor_index + 1] : 0.f; + output[tensor_index + 2] = + output[tensor_index + 2] > 0 ? output[tensor_index + 2] : 0.f; + output[tensor_index + 3] = output[tensor_index + 3] < pic_w - 1 + ? output[tensor_index + 3] + : pic_w - 1; + output[tensor_index + 4] = output[tensor_index + 4] < pic_h - 1 + ? 
output[tensor_index + 4] + : pic_h - 1; + + // Probabilities of classes + for (int i = 0; i < class_num; ++i) { + float prob = + input[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))] * + objectness; + output[tensor_index + 5 + i] = prob; + } +} + +static void YoloTensorParseCuda( + const float* input_data, // [in] YOLO_BOX_HEAD layer output + const float* image_shape_data, const float* image_scale_data, + float** bboxes_tensor_ptr, // [out] Bounding boxes output tensor + int* bbox_count_max_alloc, // [in/out] maximum bounding Box number + // allocated in dev + int* bbox_count_host, // [in/out] bounding boxes number recorded in host + int* bbox_count_device_ptr, // [in/out] bounding boxes number calculated + // in + // device side + int* bbox_index_device_ptr, // [in] bounding Box index for kernel threads + // shared access + int grid_size, int class_num, int anchors_num, int netw, int neth, + int* biases_device, float prob_thresh) { + dim3 threads_per_block(16, 16, 4); + dim3 number_of_blocks((grid_size / threads_per_block.x) + 1, + (grid_size / threads_per_block.y) + 1, + (anchors_num / threads_per_block.z) + 1); + + // Estimate how many boxes will be choosed + int bbox_count = 0; +#ifdef PADDLE_WITH_HIP + hipMemcpy(bbox_count_device_ptr, &bbox_count, sizeof(int), + hipMemcpyHostToDevice); +#else + cudaMemcpy(bbox_count_device_ptr, &bbox_count, sizeof(int), + cudaMemcpyHostToDevice); +#endif + YoloBoxNum<<>>( + input_data, bbox_count_device_ptr, grid_size, class_num, anchors_num, + prob_thresh); +#ifdef PADDLE_WITH_HIP + hipMemcpy(&bbox_count, bbox_count_device_ptr, sizeof(int), + hipMemcpyDeviceToHost); +#else + cudaMemcpy(&bbox_count, bbox_count_device_ptr, sizeof(int), + cudaMemcpyDeviceToHost); +#endif + + // Record actual bbox number + *bbox_count_host = bbox_count; + + // Obtain previous allocated bbox tensor in device side + float* bbox_tensor = *bboxes_tensor_ptr; + // Update previous maximum bbox number + if (bbox_count > *bbox_count_max_alloc) { +#ifdef PADDLE_WITH_HIP + hipFree(bbox_tensor); + hipMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); +#else + cudaFree(bbox_tensor); + cudaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); +#endif + *bbox_count_max_alloc = bbox_count; + *bboxes_tensor_ptr = bbox_tensor; + } + + // Now generate bboxes + int bbox_index = 0; +#ifdef PADDLE_WITH_HIP + hipMemcpy(bbox_index_device_ptr, &bbox_index, sizeof(int), + hipMemcpyHostToDevice); +#else + cudaMemcpy(bbox_index_device_ptr, &bbox_index, sizeof(int), + cudaMemcpyHostToDevice); +#endif + YoloTensorParseKernel<<>>( + input_data, image_shape_data, image_scale_data, bbox_tensor, + bbox_index_device_ptr, grid_size, class_num, anchors_num, netw, neth, + biases_device, prob_thresh); +} + +template +class YoloBoxPostKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using Tensor = framework::Tensor; + // prepare inputs + std::vector boxes_input(3); + std::vector> boxes_input_dims(3); + for (int i = 0; i < 3; i++) { + auto* boxes_tensor = + context.Input("Boxes" + std::to_string(i)); + boxes_input[i] = boxes_tensor->data(); + auto dims = boxes_tensor->dims(); + for (int j = 0; j < dims.size(); j++) { + boxes_input_dims[i].push_back(dims[j]); + } + } + const float* image_shape_data = + context.Input("ImageShape")->data(); + const float* image_scale_data = + context.Input("ImageScale")->data(); + + // prepare outputs + auto* boxes_scores_tensor = context.Output("Out"); + auto* 
boxes_num_tensor = context.Output("NmsRoisNum"); + + // prepare anchors + std::vector anchors; + auto anchors0 = context.Attr>("anchors0"); + auto anchors1 = context.Attr>("anchors1"); + auto anchors2 = context.Attr>("anchors2"); + anchors.insert(anchors.end(), anchors0.begin(), anchors0.end()); + anchors.insert(anchors.end(), anchors1.begin(), anchors1.end()); + anchors.insert(anchors.end(), anchors2.begin(), anchors2.end()); + int* device_anchors; +#ifdef PADDLE_WITH_HIP + hipMalloc(reinterpret_cast(&device_anchors), + anchors.size() * sizeof(int)); + hipMemcpy(device_anchors, anchors.data(), anchors.size() * sizeof(int), + hipMemcpyHostToDevice); +#else + cudaMalloc(reinterpret_cast(&device_anchors), + anchors.size() * sizeof(int)); + cudaMemcpy(device_anchors, anchors.data(), anchors.size() * sizeof(int), + cudaMemcpyHostToDevice); +#endif + int* device_anchors_ptr[3]; + device_anchors_ptr[0] = device_anchors; + device_anchors_ptr[1] = device_anchors_ptr[0] + anchors0.size(); + device_anchors_ptr[2] = device_anchors_ptr[1] + anchors1.size(); + std::vector anchors_num{static_cast(anchors0.size()) / 2, + static_cast(anchors1.size()) / 2, + static_cast(anchors2.size()) / 2}; + + // prepare other attrs + int class_num = context.Attr("class_num"); + float conf_thresh = context.Attr("conf_thresh"); + std::vector downsample_ratio{context.Attr("downsample_ratio0"), + context.Attr("downsample_ratio1"), + context.Attr("downsample_ratio2")}; + // clip_bbox and scale_x_y is not used now! + float nms_threshold = context.Attr("nms_threshold"); + + int batch = context.Input("ImageShape")->dims()[0]; + TensorInfo* ts_info = new TensorInfo[batch * boxes_input.size()]; + for (int i = 0; i < batch * static_cast(boxes_input.size()); i++) { +#ifdef PADDLE_WITH_HIP + hipMalloc( + reinterpret_cast(&ts_info[i].bboxes_dev_ptr), + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); +#else + cudaMalloc( + reinterpret_cast(&ts_info[i].bboxes_dev_ptr), + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); +#endif + ts_info[i].bboxes_host_ptr = reinterpret_cast(malloc( + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float))); +#ifdef PADDLE_WITH_HIP + hipMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), + sizeof(int)); +#else + cudaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), + sizeof(int)); +#endif + } + + // Box index counter in gpu memory + // *bbox_index_device_ptr used by atomicAdd + int* bbox_index_device_ptr; +#ifdef PADDLE_WITH_HIP + hipMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); +#else + cudaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); +#endif + + int total_bbox = 0; + for (int batch_id = 0; batch_id < batch; batch_id++) { + for (int input_id = 0; input_id < boxes_input.size(); input_id++) { + int c = boxes_input_dims[input_id][1]; + int h = boxes_input_dims[input_id][2]; + int w = boxes_input_dims[input_id][3]; + int ts_id = batch_id * boxes_input.size() + input_id; + int bbox_count_max_alloc = ts_info[ts_id].bbox_count_max_alloc; + + YoloTensorParseCuda( + boxes_input[input_id] + batch_id * c * h * w, + image_shape_data + batch_id * 2, image_scale_data + batch_id * 2, + &(ts_info[ts_id].bboxes_dev_ptr), // output in gpu,must use 2-level + // pointer, because we may + // re-malloc + &bbox_count_max_alloc, // bbox_count_alloc_ptr boxes we + // pre-allocate + &(ts_info[ts_id].bbox_count_host), // record bbox numbers + ts_info[ts_id].bbox_count_device_ptr, // for atomicAdd + bbox_index_device_ptr, // 
for atomicAdd + h, class_num, anchors_num[input_id], downsample_ratio[input_id] * h, + downsample_ratio[input_id] * w, device_anchors_ptr[input_id], + conf_thresh); + + // batch info update + if (bbox_count_max_alloc > ts_info[ts_id].bbox_count_max_alloc) { + ts_info[ts_id].bbox_count_max_alloc = bbox_count_max_alloc; + ts_info[ts_id].bboxes_host_ptr = reinterpret_cast( + realloc(ts_info[ts_id].bboxes_host_ptr, + bbox_count_max_alloc * (5 + class_num) * sizeof(float))); + } +// we need copy bbox_count_host boxes to cpu memory +#ifdef PADDLE_WITH_HIP + hipMemcpyAsync( + ts_info[ts_id].bboxes_host_ptr, ts_info[ts_id].bboxes_dev_ptr, + ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), + hipMemcpyDeviceToHost); +#else + cudaMemcpyAsync( + ts_info[ts_id].bboxes_host_ptr, ts_info[ts_id].bboxes_dev_ptr, + ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), + cudaMemcpyDeviceToHost); +#endif + total_bbox += ts_info[ts_id].bbox_count_host; + } + } + + boxes_scores_tensor->Resize({total_bbox > 0 ? total_bbox : 1, 6}); + float* boxes_scores_data = + boxes_scores_tensor->mutable_data(platform::CPUPlace()); + memset(boxes_scores_data, 0, sizeof(float) * 6); + boxes_num_tensor->Resize({batch}); + int* boxes_num_data = + boxes_num_tensor->mutable_data(platform::CPUPlace()); + int boxes_scores_id = 0; + + // NMS + for (int batch_id = 0; batch_id < batch; batch_id++) { + std::vector bbox_det_vec; + for (int input_id = 0; input_id < boxes_input.size(); input_id++) { + int ts_id = batch_id * boxes_input.size() + input_id; + int bbox_count = ts_info[ts_id].bbox_count_host; + if (bbox_count <= 0) { + continue; + } + + float* bbox_host_ptr = ts_info[ts_id].bboxes_host_ptr; + for (int bbox_index = 0; bbox_index < bbox_count; ++bbox_index) { + Detection bbox_det; + memset(&bbox_det, 0, sizeof(Detection)); + bbox_det.objectness = bbox_host_ptr[bbox_index * (5 + class_num) + 0]; + bbox_det.bbox.x = bbox_host_ptr[bbox_index * (5 + class_num) + 1]; + bbox_det.bbox.y = bbox_host_ptr[bbox_index * (5 + class_num) + 2]; + bbox_det.bbox.w = + bbox_host_ptr[bbox_index * (5 + class_num) + 3] - bbox_det.bbox.x; + bbox_det.bbox.h = + bbox_host_ptr[bbox_index * (5 + class_num) + 4] - bbox_det.bbox.y; + bbox_det.classes = class_num; + bbox_det.prob = + reinterpret_cast(malloc(class_num * sizeof(float))); + int max_prob_class_id = -1; + float max_class_prob = 0.0; + for (int class_id = 0; class_id < class_num; class_id++) { + float prob = + bbox_host_ptr[bbox_index * (5 + class_num) + 5 + class_id]; + bbox_det.prob[class_id] = prob; + if (prob > max_class_prob) { + max_class_prob = prob; + max_prob_class_id = class_id; + } + } + bbox_det.max_prob_class_index = max_prob_class_id; + bbox_det.sort_class = max_prob_class_id; + bbox_det_vec.push_back(bbox_det); + } + } + PostNMS(&bbox_det_vec, nms_threshold, class_num); + for (int i = 0; i < bbox_det_vec.size(); i++) { + boxes_scores_data[boxes_scores_id++] = + bbox_det_vec[i].max_prob_class_index; + boxes_scores_data[boxes_scores_id++] = bbox_det_vec[i].objectness; + boxes_scores_data[boxes_scores_id++] = bbox_det_vec[i].bbox.x; + boxes_scores_data[boxes_scores_id++] = bbox_det_vec[i].bbox.y; + boxes_scores_data[boxes_scores_id++] = + bbox_det_vec[i].bbox.w + bbox_det_vec[i].bbox.x; + boxes_scores_data[boxes_scores_id++] = + bbox_det_vec[i].bbox.h + bbox_det_vec[i].bbox.y; + free(bbox_det_vec[i].prob); + } + boxes_num_data[batch_id] = bbox_det_vec.size(); + } + +#ifdef PADDLE_WITH_HIP + hipFree(bbox_index_device_ptr); +#else + 
cudaFree(bbox_index_device_ptr); +#endif + for (int i = 0; i < batch * boxes_input.size(); i++) { +#ifdef PADDLE_WITH_HIP + hipFree(ts_info[i].bboxes_dev_ptr); + hipFree(ts_info[i].bbox_count_device_ptr); +#else + cudaFree(ts_info[i].bboxes_dev_ptr); + cudaFree(ts_info[i].bbox_count_device_ptr); +#endif + free(ts_info[i].bboxes_host_ptr); + } + delete[] ts_info; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(yolo_box_post, ops::YoloBoxPostKernel); diff --git a/paddle/fluid/operators/gather_op_mlu.cc b/paddle/fluid/operators/gather_op_mlu.cc index 220d045952643..cf35e051edf87 100644 --- a/paddle/fluid/operators/gather_op_mlu.cc +++ b/paddle/fluid/operators/gather_op_mlu.cc @@ -27,11 +27,28 @@ class GatherOpMLUKernel : public framework::OpKernel { auto *index = ctx.Input("Index"); auto axis = ctx.Attr("axis"); + const auto index_dims = index->dims(); + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), 1, + platform::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + auto *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc x_desc(*x); - MLUCnnlTensorDesc index_desc(*index); + int index_shape_1d[1] = {static_cast(index_dims[0])}; + MLUCnnlTensorDesc index_desc(1, index_shape_1d, + ToCnnlDataType(index->dtype())); MLUCnnlTensorDesc out_desc(*out); MLUCnnl::GatherFunctor(ctx, axis, 0 /*batch_dims*/, x_desc.get(), GetBasePtr(x), index_desc.get(), GetBasePtr(index), @@ -46,6 +63,22 @@ class GatherGradOpMLUKernel : public framework::OpKernel { auto *index = ctx.Input("Index"); auto *dout = ctx.Input(framework::GradVarName("Out")); auto *dx = ctx.Output(framework::GradVarName("X")); + + const auto index_dims = index->dims(); + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), 1, + platform::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + dx->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc dx_desc(*dx); @@ -53,7 +86,9 @@ class GatherGradOpMLUKernel : public framework::OpKernel { MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx)); - MLUCnnlTensorDesc index_desc(*index); + int index_shape_1d[1] = {static_cast(index_dims[0])}; + MLUCnnlTensorDesc index_desc(1, index_shape_1d, + ToCnnlDataType(index->dtype())); MLUCnnlTensorDesc dout_desc(*dout); const cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE; MLUCnnl::ScatterFunctor(ctx, dx_desc.get(), GetBasePtr(dx), dout_desc.get(), diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc index 8c1fd34ae87d2..f5b817a0e11fa 100644 --- a/paddle/fluid/operators/inverse_op.cc +++ b/paddle/fluid/operators/inverse_op.cc @@ -33,21 +33,21 @@ class InverseOp : public framework::OperatorWithKernel { input_rank, 2, platform::errors::InvalidArgument( "The dimension of Input(Input) is expected to be no less than 2. 
" - "But recieved: Input(Input)'s dimension = %d, shape = [%s].", + "But received: Input(Input)'s dimension = %d, shape = [%s].", input_rank, input_dims)); for (int64_t i = 0; i < input_rank; ++i) { PADDLE_ENFORCE_EQ( (input_dims[i] == -1) || (input_dims[i] > 0), true, platform::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " - "positive number, but recieved %d. Input's shape is [%s].", + "positive number, but received %d. Input's shape is [%s].", input_dims[i], input_dims)); } if (input_dims[input_rank - 2] > 0 && input_dims[input_rank - 1] > 0) { PADDLE_ENFORCE_EQ(input_dims[input_rank - 2], input_dims[input_rank - 1], platform::errors::InvalidArgument( "The last two dimensions are expected to be equal. " - "But recieved: %d and %d; " + "But received: %d and %d; " "Input(Input)'s shape = [%s].", input_dims[input_rank - 2], input_dims[input_rank - 1], input_dims)); diff --git a/paddle/fluid/operators/layer_norm_op_mlu.cc b/paddle/fluid/operators/layer_norm_op_mlu.cc new file mode 100644 index 0000000000000..a368af86a3da6 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op_mlu.cc @@ -0,0 +1,234 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class LayerNormMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto epsilon = ctx.Attr("epsilon"); + const auto* x = ctx.Input("X"); + const auto* scale = ctx.Input("Scale"); + const auto* bias = ctx.Input("Bias"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* variance = ctx.Output("Variance"); + + auto place = ctx.GetPlace(); + + y->mutable_data(place); + mean->mutable_data(place); + variance->mutable_data(place); + + const auto& x_dims = x->dims(); + std::vector scale_bias_axes; + std::vector mean_var_axes; + for (auto i = 0; i < x_dims.size(); ++i) { + if (i >= begin_norm_axis) { + scale_bias_axes.push_back(x_dims[i]); + } else { + mean_var_axes.push_back(x_dims[i]); + } + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc mean_var_desc(mean_var_axes.size(), mean_var_axes.data(), + ToCnnlDataType()); + // cnnl only support both of scale and bias is NULL or not. 
+ if (!scale && !bias) { + MLUCnnl::LayerNormForward( + ctx, begin_norm_axis, x_desc.get(), GetBasePtr(x), + nullptr /*scale_bias_desc*/, nullptr /*scale*/, nullptr /*bias*/, + epsilon, y_desc.get(), GetBasePtr(y), mean_var_desc.get(), + GetBasePtr(mean), GetBasePtr(variance)); + } else { + Tensor tmp_scale(x->dtype()); + if (!scale) { + tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); + } else { + tmp_scale = *scale; + } + + Tensor tmp_bias(x->dtype()); + if (!bias) { + tmp_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); + FillMLUTensorWithHostValue(ctx, static_cast(0), &tmp_bias); + } else { + tmp_bias = *bias; + } + + // scale and bias should have same type with x/y + MLUCnnlTensorDesc float32_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_FLOAT); + MLUCnnlTensorDesc float16_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_HALF); + cnnlCastDataType_t cast_type = GetCastDataType(VT::FP32, VT::FP16); + + Tensor final_scale(x->dtype()); + if (final_scale.dtype() == DataType::FLOAT16 && + tmp_scale.dtype() == DataType::FLOAT32) { + final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + // cast scale to fp16 + MLUCnnl::Cast(ctx, cast_type, float32_desc.get(), + GetBasePtr(&tmp_scale), float16_desc.get(), + GetBasePtr(&final_scale)); + } else { + final_scale = tmp_scale; + } + + Tensor final_bias(x->dtype()); + if (final_bias.dtype() == DataType::FLOAT16 && + tmp_bias.dtype() == DataType::FLOAT32) { + final_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); + // cast bias to fp16 + MLUCnnl::Cast(ctx, cast_type, float32_desc.get(), GetBasePtr(&tmp_bias), + float16_desc.get(), GetBasePtr(&final_bias)); + } else { + final_bias = tmp_bias; + } + + MLUCnnlTensorDesc scale_bias_desc( + scale_bias_axes.size(), scale_bias_axes.data(), ToCnnlDataType()); + MLUCnnl::LayerNormForward( + ctx, begin_norm_axis, x_desc.get(), GetBasePtr(x), + scale_bias_desc.get(), GetBasePtr(&final_scale), + GetBasePtr(&final_bias), epsilon, y_desc.get(), GetBasePtr(y), + mean_var_desc.get(), GetBasePtr(mean), GetBasePtr(variance)); + } + } +}; + +template +class LayerNormGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto* x = ctx.Input("X"); + const auto* mean = ctx.Input("Mean"); + const auto* variance = ctx.Input("Variance"); + const auto* scale = ctx.Input("Scale"); + const auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + + auto place = ctx.GetPlace(); + dx->mutable_data(place); + + const auto& x_dims = x->dims(); + std::vector scale_bias_axes; + std::vector mean_var_axes; + for (auto i = 0; i < x_dims.size(); ++i) { + if (i >= begin_norm_axis) { + scale_bias_axes.push_back(x_dims[i]); + } else { + mean_var_axes.push_back(x_dims[i]); + } + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc dy_desc(*dy); + MLUCnnlTensorDesc mean_var_desc(mean_var_axes.size(), mean_var_axes.data(), + ToCnnlDataType()); + MLUCnnlTensorDesc dx_desc(*dx); + + Tensor tmp_scale(x->dtype()); + if (!scale) { + tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); + } else { + tmp_scale = *scale; + } + 
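+    // The cnnl layer_norm kernels expect scale/bias (and their gradients) in the
+    // same dtype as x, so for the float16 kernel the fp32 Scale input is cast to
+    // fp16 before the call and the fp16 d(Scale)/d(Bias) results are cast back
+    // to fp32 afterwards; the descriptors below cover both dtypes.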
+ MLUCnnlTensorDesc float32_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_FLOAT); + MLUCnnlTensorDesc float16_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_HALF); + cnnlCastDataType_t cast_fp32_to_fp16 = GetCastDataType(VT::FP32, VT::FP16); + cnnlCastDataType_t cast_fp16_to_fp32 = GetCastDataType(VT::FP16, VT::FP32); + + Tensor final_scale(x->dtype()); + if (final_scale.dtype() == DataType::FLOAT16 && + tmp_scale.dtype() == DataType::FLOAT32) { + final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + // cast scale to fp16 + MLUCnnl::Cast(ctx, cast_fp32_to_fp16, float32_desc.get(), + GetBasePtr(&tmp_scale), float16_desc.get(), + GetBasePtr(&final_scale)); + } else { + final_scale = tmp_scale; + } + + Tensor tmp_dscale(x->dtype()); + if (dscale && (tmp_dscale.dtype() == dscale->dtype())) { + dscale->mutable_data(place); + tmp_dscale = *dscale; + } else { + tmp_dscale.mutable_data(phi::make_ddim(scale_bias_axes), place); + } + Tensor tmp_dbias(x->dtype()); + if (dbias && (tmp_dbias.dtype() == dbias->dtype())) { + dbias->mutable_data(place); + tmp_dbias = *dbias; + } else { + tmp_dbias.mutable_data(phi::make_ddim(scale_bias_axes), place); + } + + MLUCnnlTensorDesc scale_desc(scale_bias_axes.size(), scale_bias_axes.data(), + ToCnnlDataType()); + MLUCnnl::LayerNormBackward( + ctx, begin_norm_axis, x_desc.get(), GetBasePtr(x), dy_desc.get(), + GetBasePtr(dy), scale_desc.get(), GetBasePtr(&final_scale), + mean_var_desc.get(), GetBasePtr(mean), GetBasePtr(variance), + dx_desc.get(), GetBasePtr(dx), GetBasePtr(&tmp_dscale), + GetBasePtr(&tmp_dbias)); + + if (dscale && (tmp_dscale.dtype() == DataType::FLOAT16 && + dscale->dtype() == DataType::FLOAT32)) { + dscale->mutable_data(place); + MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), + GetBasePtr(&tmp_dscale), float32_desc.get(), + GetBasePtr(dscale)); + } + if (dbias && (tmp_dbias.dtype() == DataType::FLOAT16 && + dbias->dtype() == DataType::FLOAT32)) { + dbias->mutable_data(place); + MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), + GetBasePtr(&tmp_dbias), float32_desc.get(), + GetBasePtr(dbias)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(layer_norm, ops::LayerNormMLUKernel, + ops::LayerNormMLUKernel); +REGISTER_OP_MLU_KERNEL(layer_norm_grad, ops::LayerNormGradMLUKernel, + ops::LayerNormGradMLUKernel); diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index 1cd59672f97fc..e9375be1706eb 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -38,8 +38,11 @@ class LinspaceOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); + if (platform::is_xpu_place(tensor.place())) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + return expected_kernel_type; } }; diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index 86327a4f2c13a..642c8bcd9ae49 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -77,7 +77,7 @@ class LoDResetKernel : public framework::OpKernel { 
platform::errors::InvalidArgument( "The last value of 'Target LoD''s last level LoD should be equal " "to the first dimension of Input(X). But received the 'Target LoD' " - "is %s, Input(X)'s shape is is %s.", + "is %s, Input(X)'s shape is %s.", phi::make_ddim(level0), in->dims())); for (size_t i = 0; i < level0.size() - 1; ++i) { PADDLE_ENFORCE_GE(level0[i + 1], level0[i], diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index aa5fdd86745d6..fee1f56ebdcf2 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -28,15 +29,9 @@ class LogLossXPUKernel : public framework::OpKernel { loss->mutable_data(ctx.GetPlace()); int n = predict->numel(); auto& dev_ctx = ctx.template device_context(); - int r = - xpu::log_loss_fwd(dev_ctx.x_context(), n, epsilon, predict->data(), - labels->data(), loss->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU log_loss kernel return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + int r = xpu::log_loss(dev_ctx.x_context(), predict->data(), + labels->data(), loss->data(), n, epsilon); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "log_loss"); } }; template @@ -54,15 +49,10 @@ class LogLossGradXPUKernel : public framework::OpKernel { dpred->mutable_data(ctx.GetPlace()); int n = predict->numel(); auto& dev_ctx = ctx.template device_context(); - int r = xpu::log_loss_bwd(dev_ctx.x_context(), n, epsilon, - predict->data(), labels->data(), - dloss->data(), dpred->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU log_loss kernel return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + int r = xpu::log_loss_grad(dev_ctx.x_context(), predict->data(), + labels->data(), dloss->data(), + dpred->data(), n, epsilon); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "log_loss_grad"); } }; diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index e339be06d69ed..da7340e4eb0b3 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -38,7 +38,7 @@ struct TolerableValue { // NOTE(dzh): float16 value clip behave different. // 1. Our ValueClipping has a hardcore threshold 1e20 // for float number. 1e20 will resulting in overflow in float16. -// 2. float16 should expose the the real number overflow to python. +// 2. float16 should expose the real number overflow to python. // because mixed-training depends the inf/nan value to determine // if the scale value will be adjusted. // Also. In standard implementation of cross entropy, other diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f77287826ffb3..a880afb0e9be3 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -32,7 +32,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ( in1_height, input2.height(), platform::errors::InvalidArgument("The two inputs height must be equal." 
- "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2.height())); output->set_height(in1_height); @@ -56,27 +56,27 @@ struct SelectedRowsAdd { in1_row_numel, in2_value.numel() / in2_rows.size(), platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, in2_value.numel() / in2_rows.size())); PADDLE_ENFORCE_EQ( in1_row_numel, out_value->numel() / out_rows.size(), platform::errors::InvalidArgument( "The input and oupput width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, out_value->numel() / out_rows.size())); auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto in2_place = input2.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto out_place = context.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); @@ -103,14 +103,14 @@ struct SelectedRowsAddTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); PADDLE_ENFORCE_EQ( in1_height, out_dims[0], platform::errors::InvalidArgument( "The input and output height must be equal." - "But recieved input height = [%d], output height = [%d]", + "But received input height = [%d], output height = [%d]", in1_height, out_dims[0])); auto& in1_value = input1.value(); @@ -121,13 +121,13 @@ struct SelectedRowsAddTensor { in1_row_numel, input2.numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2.numel() / in1_height)); PADDLE_ENFORCE_EQ( in1_row_numel, output->numel() / in1_height, platform::errors::InvalidArgument( "The input and output width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); phi::funcs::SetConstant functor; @@ -161,7 +161,7 @@ struct SelectedRowsAddTo { PADDLE_ENFORCE_EQ( in1_height, input2->height(), platform::errors::InvalidArgument("The two inputs height must be equal." 
- "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2->height())); @@ -178,11 +178,11 @@ struct SelectedRowsAddTo { auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto in2_place = input2->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -211,7 +211,7 @@ struct SelectedRowsSumTo { PADDLE_ENFORCE_EQ(in1_height, input2->height(), platform::errors::InvalidArgument( "The two inputs height must be equal." - "But recieved first input height = [%d], second " + "But received first input height = [%d], second " "input height = [%d]", in1_height, input2->height())); } @@ -253,7 +253,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -265,7 +265,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -293,7 +293,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -305,7 +305,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -842,7 +842,7 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -854,7 +854,7 @@ struct UpdateToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." 
- "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 542d4c9784352..db5c66d319701 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -33,7 +33,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ( in1_height, input2.height(), platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2.height())); output->set_height(in1_height); @@ -57,13 +57,13 @@ struct SelectedRowsAdd { in1_row_numel, in2_value.numel() / in2_rows.size(), platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, in2_value.numel() / in2_rows.size())); PADDLE_ENFORCE_EQ( in1_row_numel, out_value->numel() / out_rows.size(), platform::errors::InvalidArgument( "The input and oupput width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, out_value->numel() / out_rows.size())); auto* out_data = out_value->data(); @@ -72,15 +72,15 @@ struct SelectedRowsAdd { auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto in2_place = input2.place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto out_place = context.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); memory::Copy(out_place, out_data, in1_place, in1_data, in1_value.numel() * sizeof(T), context.stream()); @@ -126,13 +126,13 @@ struct SelectedRowsAddTensor { in1_height, in2_dims[0], platform::errors::InvalidArgument( "The two inputs height must be equal." - "But recieved first input height = [%d], first input height = [%d]", + "But received first input height = [%d], first input height = [%d]", in1_height, in2_dims[0])); PADDLE_ENFORCE_EQ( in1_height, out_dims[0], platform::errors::InvalidArgument( "The input and output height must be equal." - "But recieved input height = [%d], output height = [%d]", + "But received input height = [%d], output height = [%d]", in1_height, out_dims[0])); auto& in1_value = input1.value(); @@ -143,13 +143,13 @@ struct SelectedRowsAddTensor { in1_row_numel, input2.numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." 
- "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2.numel() / in1_height)); PADDLE_ENFORCE_EQ( in1_row_numel, output->numel() / in1_height, platform::errors::InvalidArgument( "The input and output width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -186,13 +186,13 @@ struct SelectedRowsAddTensor { in1_height, in2_dims[0], platform::errors::InvalidArgument( "The two inputs height must be equal." - "But recieved first input height = [%d], first input height = [%d]", + "But received first input height = [%d], first input height = [%d]", in1_height, in2_dims[0])); PADDLE_ENFORCE_EQ( in1_height, out_dims[0], platform::errors::InvalidArgument( "The input and output height must be equal." - "But recieved input height = [%d], output height = [%d]", + "But received input height = [%d], output height = [%d]", in1_height, out_dims[0])); auto& in1_value = input1.value(); @@ -203,13 +203,13 @@ struct SelectedRowsAddTensor { in1_row_numel, input2.numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2.numel() / in1_height)); PADDLE_ENFORCE_EQ( in1_row_numel, output->numel() / in1_height, platform::errors::InvalidArgument( "The input and output width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -254,7 +254,7 @@ struct SelectedRowsAddTo { PADDLE_ENFORCE_EQ( in1_height, input2->height(), platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2->height())); @@ -273,11 +273,11 @@ struct SelectedRowsAddTo { auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto in2_place = input2->place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -322,7 +322,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -334,7 +334,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." 
- "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -359,7 +359,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -371,7 +371,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -675,7 +675,7 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -687,7 +687,7 @@ struct UpdateToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.template data(); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 32ef052119883..ed58c90e17022 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -36,7 +36,7 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { // TODO(typhoonzero): support both inference value and indices. AddInput("Out", "The network output of topk (inferences)"); - AddInput("Indices", "The the network output of topk (indices)"); + AddInput("Indices", "The network output of topk (indices)"); AddInput("Label", "Label of the training data"); // TODO(typhoonzero): AddInput("Weight", ... AddOutput("Accuracy", "The accuracy of current batch"); diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index 3cc1be4de8a82..82e4b90468a38 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -42,68 +42,26 @@ class AccuracyXPUKernel : public framework::OpKernel { if (num_samples == 0) { return; } - size_t indices_int32_size = num_samples * class_dim * sizeof(int); - size_t indices_int64_size = num_samples * class_dim * sizeof(int64_t); - size_t label_int32_size = num_samples * sizeof(int); - size_t label_int64_size = num_samples * sizeof(int64_t); auto& dev_ctx = ctx.template device_context(); - int* indices_int32_device = NULL; - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&indices_int32_device), - indices_int32_size), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot allocate %s memory" - " on XPU. \n\nPlease check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(indices_int32_size))); - int* label_int32_device = NULL; - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&label_int32_device), - label_int32_size), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot allocate %s memory" - " on XPU. \n\nPlease check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(label_int32_size))); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int size = num_samples * class_dim; + int* indices_int32_ptr = RAII_GUARD.alloc_l3_or_gm(size); + PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int32_ptr); + int* label_int32_ptr = RAII_GUARD.alloc_l3_or_gm(size); + PADDLE_ENFORCE_XDNN_NOT_NULL(label_int32_ptr); - int* indices_int32_host = - reinterpret_cast(std::malloc(indices_int32_size)); - int64_t* indices_int64_host = - reinterpret_cast(std::malloc(indices_int64_size)); - int* label_int32_host = - reinterpret_cast(std::malloc(label_int32_size)); - int64_t* label_int64_host = - reinterpret_cast(std::malloc(label_int64_size)); - dev_ctx.Wait(); - memory::Copy(platform::CPUPlace(), indices_int64_host, ctx.GetPlace(), - indices_data, indices_int64_size); - memory::Copy(platform::CPUPlace(), label_int64_host, ctx.GetPlace(), - label_data, label_int64_size); - for (size_t i = 0; i < num_samples; ++i) { - label_int32_host[i] = label_int64_host[i]; - for (size_t j = 0; j < class_dim; ++j) { - indices_int32_host[i * class_dim + j] = - indices_int64_host[i * class_dim + j]; - } - } - memory::Copy(ctx.GetPlace(), indices_int32_device, platform::CPUPlace(), - indices_int32_host, indices_int32_size); - memory::Copy(ctx.GetPlace(), label_int32_device, platform::CPUPlace(), - label_int32_host, label_int32_size); - int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device, - label_int32_device, num_samples, class_dim, - correct_data, total_data, accuracy_data); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU accuracy kernel error!")); - dev_ctx.Wait(); - xpu_free(indices_int32_device); - xpu_free(label_int32_device); - std::free(indices_int32_host); - std::free(indices_int64_host); - std::free(label_int32_host); - std::free(label_int64_host); + int r = xpu::cast_v2(dev_ctx.x_context(), indices_data, + indices_int32_ptr, size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + + r = xpu::cast_v2(dev_ctx.x_context(), label_data, + label_int32_ptr, size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + + r = xpu::accuracy(dev_ctx.x_context(), indices_int32_ptr, label_int32_ptr, + 
num_samples, class_dim, correct_data, total_data, + accuracy_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); } }; diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 67d1aaa4baf52..fba17d303f282 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -223,9 +223,17 @@ class ConvMKLDNNHandlerT float sum_scale = 1.0f; float activation_scale = 1.0f; std::vector output_shift_scale; - if (platform::is_int8()) - std::tie(sum_scale, output_shift_scale, activation_scale) = - get_int8_scales(ctx); + if (platform::is_int8()) { + if (ctx.HasAttr("Sum_scale")) { + sum_scale = ctx.Attr("Sum_scale"); + activation_scale = ctx.Attr("Activation_scale"); + output_shift_scale = + ctx.Attr>("Output_shift_scale"); + } else { + std::tie(sum_scale, output_shift_scale, activation_scale) = + get_int8_scales(ctx); + } + } const dnnl::primitive_attr conv_attr = CreatePostOps( fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, @@ -872,8 +880,18 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { {DNNL_ARG_DST, *dst_memory_p}}; if (bias) { - auto p_scales_tuple = handler.get_int8_bias_scales(ctx); - + std::vector bias_scales; + auto p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), bias_scales)); + if (ctx.HasAttr("Bias_scales")) { + bias_scales = ctx.Attr>("Bias_scales"); + p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), bias_scales)); + } else { + p_scales_tuple = handler.get_int8_bias_scales(ctx); + } auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( bias, true, std::get<1>(*p_scales_tuple), std::get<0>(*p_scales_tuple)); diff --git a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc new file mode 100644 index 0000000000000..cfc320da47fff --- /dev/null +++ b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +class FillConstantMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + FillConstantMKLDNNHandler(Tensor* out, dnnl::engine engine, + platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + const auto src0_md = dnnl::memory::desc( + {out->numel(), sizeof(T)}, platform::MKLDNNGetDataType(), + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attrs; + attrs.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {0.0f}); + + this->AcquireForwardPrimitiveDescriptor(attrs, dnnl::algorithm::binary_add, + src0_md, src1_md, src0_md); + } + + static const dnnl::memory::desc src1_md; +}; + +template +const dnnl::memory::desc FillConstantMKLDNNHandler::src1_md( + {1, sizeof(T)}, platform::MKLDNNGetDataType(), + dnnl::memory::format_tag::ab); + +template +class FillConstantMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& dnnl_engine = dev_ctx.GetEngine(); + + auto* out = ctx.Output("Out"); + T fill_value = CalculateFillValue(ctx); + + auto shape = GetShape(ctx); + out->Resize(shape); + + FillConstantMKLDNNHandler handler(out, dnnl_engine, ctx.GetPlace()); + + dnnl::memory constant_value_memory = + dnnl::memory(FillConstantMKLDNNHandler::src1_md, dnnl_engine, + reinterpret_cast(&fill_value)); + + auto src0_memory_p = handler.AcquireDstMemory(out); + auto fill_constant_p = handler.AcquireForwardPrimitive(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + fill_constant_p->execute(astream, {{DNNL_ARG_SRC_0, *src0_memory_p}, + {DNNL_ARG_SRC_1, constant_value_memory}, + {DNNL_ARG_DST, *src0_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetPlainMKLDNNFormat(out->dims().size())); + } + + T CalculateFillValue(const framework::ExecutionContext& ctx) const { + const auto str_value = ctx.Attr("str_value"); + const auto float_value = ctx.Attr("value"); + + T value; + + if (str_value.empty()) { + value = static_cast(float_value); + } else { + // handle NaN/Inf first, which cannot be read from stream + if (str_value == "inf") { + value = static_cast(std::numeric_limits::infinity()); + } else if (str_value == "-inf") { + value = static_cast(-std::numeric_limits::infinity()); + } else if (str_value == "nan") { + value = static_cast(std::numeric_limits::quiet_NaN()); + } else { + std::stringstream convert_stream(str_value); + double tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } + } + + if (ctx.HasInput("ValueTensor")) { + const auto* value_tensor = ctx.Input("ValueTensor"); + PADDLE_ENFORCE_EQ( + value_tensor->numel(), 1, + platform::errors::InvalidArgument( + "When use Tensor as value to set Tensor value in fill_constant, " + "value input(ValueTensor) size must be 1, but got %d", + value_tensor->numel())); + value = value_tensor->data()[0]; + } + + return value; + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(fill_constant, MKLDNN, paddle::platform::CPUPlace, + ops::FillConstantMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake 
b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake index c471ba62f609b..3ebfbdc50caab 100644 --- a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake +++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake @@ -1,2 +1 @@ -cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op activation_op pooling transpose_op scope device_context enforce executor) - +cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op activation_op pooling transpose_op scope device_context enforce executor) diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index f04c73ec0b249..517f782e18758 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -32,6 +32,16 @@ class ShapeMKLDNNKernel : public framework::OpKernel { in_dims = in_var->Get().value().dims(); } else { in_dims = in_var->Get().dims(); + // Output of shape op is often fed as input to fill_constant ops + // and we need to rotate a shape otherwise Tensors of wrong shape may be + // allocated + if (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC && + in_dims.size() >= 3) { + auto rdims = phi::vectorize(in_dims); + std::rotate(rdims.begin() + 1, rdims.begin() + 2, rdims.end()); + in_dims = phi::make_ddim(rdims); + } } auto* out_t = ctx.Output("Out"); out_t->Resize({in_dims.size()}); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 0e988557df626..4ff93ee3cd624 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -32,9 +32,12 @@ USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +USE_OP_ITSELF(shape); +USE_OP_DEVICE_KERNEL(shape, MKLDNN); PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(shape, CPU, ALL_LAYOUT); namespace paddle { namespace operators { @@ -154,5 +157,59 @@ TEST(test_pool2d_relu_relu_nhwc, cpu_place) { platform::errors::InvalidArgument( "Computed shape does not match expected shape")); } + +TEST(test_pool2d_shape_nhwc, cpu_place) { + framework::DDim dims({1, 4, 8, 512}); // NHWC shape + std::vector expected_dims{1, 3, 7, 512}; // NHWC expected shape + platform::CPUPlace p; + framework::Scope scope; + + InputVars input_name = {"x", + scope.Var("x")->GetMutable()}; + // Initialize input data + std::uniform_real_distribution dist(static_cast(10.0), + static_cast(20.0)); + std::mt19937 engine; + size_t numel = static_cast(phi::product(dims)); + input_name.tensor->Resize(dims); + auto data_ptr = input_name.tensor->mutable_data(p); + for (size_t i = 0; i < numel; ++i) { + data_ptr[i] = dist(engine); + } + + scope.Var("y")->GetMutable(); + auto *z = scope.Var("z")->GetMutable(); + + auto &pool = platform::DeviceContextPool::Instance(); + + // Make pool2d followed by shape. 
shape for NHWC should return + // as output tensor not-rotated shape of Pool ( + + auto ksize = std::vector(2, 2); + auto op_pool = framework::OpRegistry::CreateOp( + "pool2d", {{"X", {"x"}}}, {{"Out", {"y"}}}, + {{"pooling_type", {std::string("max")}}, + {"ksize", {ksize}}, + {"data_format", {std::string("NHWC")}}, + {"use_mkldnn", {true}}}); + + auto op_shape = framework::OpRegistry::CreateOp( + "shape", {{"Input", {"y"}}}, {{"Out", {"z"}}}, {{"use_mkldnn", {true}}}); + + op_pool->Run(scope, p); + op_shape->Run(scope, p); + + pool.Get(p)->Wait(); + + // repack tensor data into vector for easy comparison + auto *zdata = z->data(); + std::vector vzdata(zdata, zdata + z->numel()); + + // Verify shape of output + PADDLE_ENFORCE_EQ(vzdata, expected_dims, + platform::errors::InvalidArgument( + "Computed shape does not match expected shape")); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index eacab46800580..867c5f212ba6c 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -688,8 +688,9 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { const cnnlTensorDescriptor_t diff_y_desc, void* back_out) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlSparseSoftmaxCrossEntropyWithLogits( - handle, mode, x_desc, input, label_desc, label, y_desc, output, + const cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSparseSoftmaxCrossEntropyWithLogits_v2( + handle, prefer, mode, x_desc, input, label_desc, label, y_desc, output, diff_y_desc, back_out)); } @@ -697,14 +698,14 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { const bool exclusive, const bool reverse, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t ouput_desc, + const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); // NAN propagation mode: Only support CNNL_NOT_PROPAGATE_NAN now. 
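  // The `exclusive` and `reverse` flags are forwarded to cnnlCumsum unchanged;
  // as in most cumsum APIs, exclusive partial sums omit the current element and
  // reverse accumulates from the last element. Only the NaN mode is fixed here.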
cnnlNanPropagation_t mode = CNNL_NOT_PROPAGATE_NAN; PADDLE_ENFORCE_MLU_SUCCESS(cnnlCumsum(handle, input_desc, input, axis, - exclusive, reverse, mode, ouput_desc, + exclusive, reverse, mode, output_desc, output)); } @@ -805,17 +806,17 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { } /* static */ void MLUCnnl::ApplyAdam( - const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, - const void* grad, const void* lr, const void* beta1, const void* beta2, - const void* beta1_power, const void* beta2_power, const void* epsilon, - const bool use_nesterov, const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t m_desc, void* m, - const cnnlTensorDescriptor_t v_desc, void* v) { + const ExecutionContext& ctx, const cnnlTensorDescriptor_t var_desc, + void* var, const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v, + const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, + const void* beta1, const void* beta2, const void* beta1_power, + const void* beta2_power, const void* epsilon, const bool use_nesterov) { cnnlHandle_t handle = GetHandleFromCTX(ctx); PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdam( - handle, grad_desc, var, grad_desc, m, grad_desc, v, grad_desc, grad, lr, - beta1, beta2, beta1_power, beta2_power, epsilon, use_nesterov)); + handle, var_desc, var, m_desc, m, v_desc, v, grad_desc, grad, lr, beta1, + beta2, beta1_power, beta2_power, epsilon, use_nesterov)); } /* static */ void MLUCnnl::ApplyAdaMax( @@ -2077,6 +2078,45 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { } } +/* static */ void MLUCnnl::LayerNormForward( + const ExecutionContext& ctx, int axis, const cnnlTensorDescriptor_t x_desc, + const void* x, const cnnlTensorDescriptor_t weight_bias_desc, + const void* weight, const void* bias, float eps, + const cnnlTensorDescriptor_t y_desc, void* y, + const cnnlTensorDescriptor_t mean_rstd_desc, void* saved_mean, + void* saved_rstd) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetLayerNormOpWorkspaceSize(handle, axis, x_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlLayerNormForward(handle, x_desc, x, axis, weight_bias_desc, weight, + bias, eps, workspace_ptr, workspace_size, y_desc, y, + mean_rstd_desc, saved_mean, saved_rstd)); +} + +/* static */ void MLUCnnl::LayerNormBackward( + const ExecutionContext& ctx, int axis, const cnnlTensorDescriptor_t x_desc, + const void* x, const cnnlTensorDescriptor_t diff_z_desc, const void* diff_z, + const cnnlTensorDescriptor_t weight_bias_desc, const void* weight, + const cnnlTensorDescriptor_t mean_rstd_desc, const void* saved_mean, + const void* saved_rstd, const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x, void* diff_weight, void* diff_bias) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlLayerNormBackward( + handle, x_desc, x, axis, diff_z_desc, diff_z, weight_bias_desc, weight, + mean_rstd_desc, saved_mean, saved_rstd, diff_x_desc, diff_x, diff_weight, + diff_bias)); +} + /* static */ void MLUCnnl::QuantizeParam( const ExecutionContext& ctx, const cnnlQuantizeMode_t mode, const int bitwidth, const cnnlTensorDescriptor_t input_desc, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 
572b7aa2bbd01..24db6c760d78a 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -146,10 +146,8 @@ const std::map, cnnlCastDataType_t> {{VT::FP16, /*cast to*/ VT::BOOL}, CNNL_CAST_HALF_TO_BOOL}, {{VT::INT32, /*cast to*/ VT::FP32}, CNNL_CAST_INT32_TO_FLOAT}, {{VT::INT32, /*cast to*/ VT::FP16}, CNNL_CAST_INT32_TO_HALF}, - {{VT::INT32, /*cast to*/ VT::INT64}, CNNL_CAST_INT32_TO_INT64}, - {{VT::INT32, /*cast to*/ VT::INT16}, CNNL_CAST_INT32_TO_INT16}, {{VT::INT32, /*cast to*/ VT::INT8}, CNNL_CAST_INT32_TO_INT8}, - {{VT::INT32, /*cast to*/ VT::BOOL}, CNNL_CAST_INT32_TO_BOOL}, + {{VT::INT32, /*cast to*/ VT::INT16}, CNNL_CAST_INT32_TO_INT16}, {{VT::INT16, /*cast to*/ VT::FP32}, CNNL_CAST_INT16_TO_FLOAT}, {{VT::INT16, /*cast to*/ VT::FP16}, CNNL_CAST_INT16_TO_HALF}, {{VT::INT16, /*cast to*/ VT::INT32}, CNNL_CAST_INT16_TO_INT32}, @@ -158,12 +156,21 @@ const std::map, cnnlCastDataType_t> {{VT::INT8, /*cast to*/ VT::INT32}, CNNL_CAST_INT8_TO_INT32}, {{VT::UINT8, /*cast to*/ VT::FP32}, CNNL_CAST_UINT8_TO_FLOAT}, {{VT::UINT8, /*cast to*/ VT::FP16}, CNNL_CAST_UINT8_TO_HALF}, - {{VT::UINT8, /*cast to*/ VT::INT64}, CNNL_CAST_UINT8_TO_INT64}, - {{VT::UINT8, /*cast to*/ VT::INT32}, CNNL_CAST_UINT8_TO_INT32}, {{VT::BOOL, /*cast to*/ VT::FP32}, CNNL_CAST_BOOL_TO_FLOAT}, {{VT::BOOL, /*cast to*/ VT::FP16}, CNNL_CAST_BOOL_TO_HALF}, {{VT::BOOL, /*cast to*/ VT::INT32}, CNNL_CAST_BOOL_TO_INT32}, + {{VT::UINT8, /*cast to*/ VT::INT32}, CNNL_CAST_UINT8_TO_INT32}, + {{VT::INT32, /*cast to*/ VT::INT64}, CNNL_CAST_INT32_TO_INT64}, {{VT::INT64, /*cast to*/ VT::INT32}, CNNL_CAST_INT64_TO_INT32}, + {{VT::INT32, /*cast to*/ VT::BOOL}, CNNL_CAST_INT32_TO_BOOL}, + {{VT::UINT8, /*cast to*/ VT::INT64}, CNNL_CAST_UINT8_TO_INT64}, + {{VT::INT8, /*cast to*/ VT::INT16}, CNNL_CAST_INT8_TO_INT16}, + {{VT::FP32, /*cast to*/ VT::FP64}, CNNL_CAST_FLOAT_TO_DOUBLE}, + {{VT::FP64, /*cast to*/ VT::FP32}, CNNL_CAST_DOUBLE_TO_FLOAT}, + {{VT::INT64, /*cast to*/ VT::FP32}, CNNL_CAST_INT64_TO_FLOAT}, + {{VT::INT64, /*cast to*/ VT::FP16}, CNNL_CAST_INT64_TO_HALF}, + {{VT::FP32, /*cast to*/ VT::INT64}, CNNL_CAST_FLOAT_TO_INT64}, + {{VT::FP16, /*cast to*/ VT::INT64}, CNNL_CAST_HALF_TO_INT64}, }; cnnlCastDataType_t GetCastDataType(const VT::Type& src_type, @@ -496,14 +503,14 @@ class MLUCnnl { const cnnlTensorDescriptor_t mom_desc, void* mom); static void ApplyAdam(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v, const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, const void* beta1, const void* beta2, const void* beta1_power, const void* beta2_power, const void* epsilon, - const bool use_nesterov, - const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t m_desc, void* m, - const cnnlTensorDescriptor_t v_desc, void* v); + const bool use_nesterov); static void ApplyAdaMax(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, @@ -1103,6 +1110,24 @@ class MLUCnnl { const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop, void* scale_backprop, void* offset_backprop); + static void LayerNormForward(const ExecutionContext& ctx, int axis, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t weight_bias_desc, + const void* weight, const void* bias, float eps, + const cnnlTensorDescriptor_t y_desc, void* y, + const cnnlTensorDescriptor_t mean_rstd_desc, + void* saved_mean, void* 
saved_rstd); + + static void LayerNormBackward( + const ExecutionContext& ctx, int axis, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t diff_z_desc, const void* diff_z, + const cnnlTensorDescriptor_t weight_bias_desc, const void* weight, + const cnnlTensorDescriptor_t mean_rstd_desc, const void* saved_mean, + const void* saved_rstd, const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x, void* diff_weight, void* diff_bias); + static void Transpose(const ExecutionContext& ctx, const std::vector perm, const int input_dim, const cnnlTensorDescriptor_t input_desc, @@ -1230,5 +1255,13 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx, GetBasePtr(transformed_output)); } +template +inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx, T value, + Tensor* out) { + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), + GetBasePtr(out)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/multinomial_op_npu.cc b/paddle/fluid/operators/multinomial_op_npu.cc new file mode 100644 index 0000000000000..316554e98f01e --- /dev/null +++ b/paddle/fluid/operators/multinomial_op_npu.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in +// cmake/operators.cmake when Paddle supports +#if (CANN_VERSION_CODE >= 504000) + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/npu/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class NPUMultinomialKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + const int64_t num_samples = ctx.Attr("num_samples"); + const bool replacement = ctx.Attr("replacement"); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + out->mutable_data(place); + + const auto& runner = NpuOpRunner( + "MultinomialWithReplacementD", {*x}, {*out}, + {{"num_samples", num_samples}, {"replacement", replacement}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL( + multinomial, + ops::NPUMultinomialKernel, + ops::NPUMultinomialKernel) +#endif diff --git a/paddle/fluid/operators/optimizers/adam_op_mlu.cc b/paddle/fluid/operators/optimizers/adam_op_mlu.cc new file mode 100644 index 0000000000000..9d335021234eb --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op_mlu.cc @@ -0,0 +1,285 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class AdamMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + auto* param = ctx.Input("Param"); + auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Grad(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(param_var->Type()))); + auto* grad = ctx.Input("Grad"); + auto* mom1 = ctx.Input("Moment1"); + auto* mom2 = ctx.Input("Moment2"); + auto* lr = ctx.Input("LearningRate"); + + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); + + auto* param_out = ctx.Output("ParamOut"); + auto* mom1_out = ctx.Output("Moment1Out"); + auto* mom2_out = ctx.Output("Moment2Out"); + auto* beta1_pow_out = ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); + skip_update = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update) { + VLOG(4) << "Adam skip update"; + framework::TensorCopy( + *param, ctx.GetPlace(), + ctx.template device_context(), param_out); + framework::TensorCopy( + *mom1, ctx.GetPlace(), + ctx.template device_context(), mom1_out); + framework::TensorCopy( + *mom2, ctx.GetPlace(), + ctx.template device_context(), mom2_out); + framework::TensorCopy( + *beta1_pow, beta1_pow->place(), + ctx.template device_context(), + beta1_pow_out); + framework::TensorCopy( + *beta2_pow, beta2_pow->place(), + ctx.template device_context(), + beta2_pow_out); + return; + } + + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + param_out->ShareDataWith(*param); + mom1_out->ShareDataWith(*mom1); + mom2_out->ShareDataWith(*mom2); + + LoDTensor beta1_pow_tmp; + LoDTensor beta2_pow_tmp; + if (beta1_pow->place() == platform::CPUPlace()) { + T beta1 = *beta1_pow->data(); + beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp); + MLUCnnl::Fill(ctx, 
CNNL_POINTER_MODE_HOST, &beta1, + beta1_pow_tmp_desc.get(), GetBasePtr(&beta1_pow_tmp)); + beta1_pow = &beta1_pow_tmp; + } + if (beta2_pow->place() == platform::CPUPlace()) { + T beta2 = *beta2_pow->data(); + beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, + beta2_pow_tmp_desc.get(), GetBasePtr(&beta2_pow_tmp)); + beta2_pow = &beta2_pow_tmp; + } + + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() + << "beta2_pow.numel() : " << beta2_pow->numel(); + VLOG(3) << "param.numel(): " << param->numel(); + + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const Tensor* beta1_tensor = nullptr; + const Tensor* beta2_tensor = nullptr; + const Tensor* epsilon_tensor = nullptr; + + Tensor beta1_tmp(experimental::DataType::FLOAT32); + Tensor beta2_tmp(experimental::DataType::FLOAT32); + Tensor epsilon_tmp(experimental::DataType::FLOAT32); + + if (ctx.HasInput("Beta1Tensor")) { + beta1_tensor = ctx.Input("Beta1Tensor"); + PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta1Tensor) size must be 1, but get %d", + beta1_tensor->numel())); + } else { + T beta1 = static_cast(ctx.Attr("beta1")); + beta1_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1, beta1_tmp_desc.get(), + GetBasePtr(&beta1_tmp)); + beta1_tensor = &beta1_tmp; + } + + if (ctx.HasInput("Beta2Tensor")) { + beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta2Tensor) size must be 1, but get %d", + beta2_tensor->numel())); + } else { + T beta2 = static_cast(ctx.Attr("beta2")); + beta2_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, beta2_tmp_desc.get(), + GetBasePtr(&beta2_tmp)); + beta2_tensor = &beta2_tmp; + } + + if (ctx.HasInput("EpsilonTensor")) { + epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + } else { + T epsilon = static_cast(ctx.Attr("epsilon")); + epsilon_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &epsilon, + epsilon_tmp_desc.get(), GetBasePtr(&epsilon_tmp)); + epsilon_tensor = &epsilon_tmp; + } + + MLUCnnlTensorDesc param_desc(*param); + MLUCnnlTensorDesc mom1_desc(*mom1); + MLUCnnlTensorDesc mom2_desc(*mom2); + MLUCnnlTensorDesc grad_desc(*grad); + MLUCnnl::ApplyAdam(ctx, param_desc.get(), GetBasePtr(param_out), + mom1_desc.get(), GetBasePtr(mom1_out), mom2_desc.get(), + GetBasePtr(mom2_out), grad_desc.get(), GetBasePtr(grad), + GetBasePtr(lr), GetBasePtr(beta1_tensor), + GetBasePtr(beta2_tensor), GetBasePtr(beta1_pow), + GetBasePtr(beta2_pow), GetBasePtr(epsilon_tensor), + /*use_nesterov*/ false); + + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc 
beta1_desc(*beta1_tensor); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(), + GetBasePtr(beta1_pow), beta1_desc.get(), + GetBasePtr(beta1_tensor), beta1_desc.get(), + GetBasePtr(beta1_pow_out), ToCnnlDataType()); + + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(), + GetBasePtr(beta2_pow), beta1_desc.get(), + GetBasePtr(beta2_tensor), beta1_desc.get(), + GetBasePtr(beta2_pow_out), ToCnnlDataType()); + } + } +}; + +template +class AdamWMLUKernel : public AdamMLUKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + VLOG(3) << "MLU AdamW Kernel"; + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + VLOG(3) << "Has SkipUpdate"; + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); + skip_update = skip_update_vec[0]; + } + VLOG(3) << "Skip update" << skip_update; + bool with_decay = ctx.Attr("with_decay"); + if (!skip_update && with_decay) { + if (ctx.HasInput("MasterParam")) { + PADDLE_THROW(platform::errors::Unimplemented( + "Master Param is not supported on MLU")); + } else { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + auto* param = ctx.Input("Param"); + auto* lr = ctx.Input("LearningRate"); + float coeff = ctx.Attr("coeff"); + + // update param with decay coeff: mul(-1 * lr, coeff * param) + param + MLUCnnlTensorDesc lr_desc(*lr); + MLUCnnlTensorDesc param_desc(*param); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), lr_desc.get(), GetBasePtr(lr), + param_desc.get(), GetBasePtr(param), param_desc.get(), + const_cast(GetBasePtr(param)), + ToCnnlDataType(), + /*alpha1*/ -1.f, /*alpha2*/ coeff, /*beta*/ 1.f); + } + } + AdamMLUKernel::Compute(ctx); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(adam, ops::AdamMLUKernel, + ops::AdamMLUKernel); + +REGISTER_OP_MLU_KERNEL(adamw, ops::AdamWMLUKernel, + ops::AdamWMLUKernel); diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc index e7cbe4aa8dd4b..7aa5783a01bfd 100644 --- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/optimizers/lamb_op.h" #include "gflags/gflags.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -70,44 +71,18 @@ class LambOpXPUKernel : public framework::OpKernel { if (grad_var->IsType()) { auto& grad = *ctx.Input("Grad"); - int r = xpu::lamb(dev_ctx.x_context(), grad.template data(), - mom1.template data(), mom2.template data(), - param.template data(), beta1_pow.template data(), - beta2_pow.template data(), beta1, beta2, epsilon, - weight_decay, lr.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2_out.template mutable_data(ctx.GetPlace()), - param_out.template mutable_data(ctx.GetPlace()), - beta1_pow_out.template mutable_data(ctx.GetPlace()), - beta2_pow_out.template mutable_data(ctx.GetPlace()), - param.numel()); + int r = xpu::lamb( + dev_ctx.x_context(), grad.template data(), mom1.template data(), + mom2.template data(), param.template data(), + beta1_pow.template data(), beta2_pow.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), + beta1_pow_out.template mutable_data(ctx.GetPlace()), + beta2_pow_out.template mutable_data(ctx.GetPlace()), beta1, beta2, + epsilon, weight_decay, lr.template data(), param.numel()); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of LambOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of LambOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of LambOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } else { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of LambOp, error " - "message: OTHER " - "XPU API returns error code: %d.", - r)); - } + PADDLE_ENFORCE_XDNN_SUCCESS(r, "lamb"); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Variable type not supported by lamb_op. Expect LoDTensor, " diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index 85c2d42c841f0..b53d51686cfd7 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -105,40 +106,15 @@ class RmspropOpXPUKernel : public framework::OpKernel { /// const float* ms, const float* g, const float* mom, /// float epsilon, float rho, float momentum, float lr, /// float *ms_out, float *mom_out, float *p_out, int n) - int r = xpu::rmsprop(dev_ctx.x_context(), param.template data(), - meanSquare.template data(), grad.template data(), - mom.template data(), epsilon, decay, momentum, lr, + int r = xpu::rmsprop(dev_ctx.x_context(), grad.template data(), + param.template data(), + meanSquare.template data(), mom.template data(), + param_out.template mutable_data(ctx.GetPlace()), mom_sqrt_out.template mutable_data(ctx.GetPlace()), mom_out.template mutable_data(ctx.GetPlace()), - param_out.template mutable_data(ctx.GetPlace()), - param.numel()); - - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of RmspropOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of RmspropOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of RmspropOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } else { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of RmspropOp, error " - "message: OTHER " - "XPU API returns error code: %d.", - r)); - } + epsilon, decay, momentum, lr, param.numel()); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "rmsprop"); } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc index 9dabca1b66a77..e7c03be95cae1 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc @@ -14,11 +14,15 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/optimizers/sgd_op.h" #include +#include "paddle/fluid/platform/device/device_wrapper.h" + namespace paddle { namespace operators { template class SGDOpXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *learning_rate = ctx.Input("LearningRate"); @@ -48,40 +52,31 @@ class SGDOpXPUKernel : public framework::OpKernel { "numel = [%s], ParamOut's numel = [%s]", grad->numel(), sz)); - const T *lr = learning_rate->data(); + const T *lr_t = learning_rate->data(); + auto &dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + const float *lr = nullptr; + if (std::is_same::value) { + float *lr_float = + RAII_GUARD.alloc_l3_or_gm(learning_rate->numel()); + int r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(lr_t), + lr_float, learning_rate->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); + lr = lr_float; + } else { + lr = reinterpret_cast(lr_t); + } + const T *param_data = param->data(); const T *grad_data = grad->data(); T *out_data = param_out->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); - int r = xpu::sgd(dev_ctx.x_context(), sz, grad_data, param_data, lr, - out_data); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of SgdOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of SgdOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of SgdOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } - } else { - PADDLE_ENFORCE_EQ(false, true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Param & Grad in " - "SgdOp-XPU. Excepted " - "LodTensor, But received [%s] and [%s]", - paddle::framework::ToTypeName(param_var->Type()))); + int r = xpu::sgd(dev_ctx.x_context(), + reinterpret_cast(grad_data), + reinterpret_cast(param_data), lr, + reinterpret_cast(out_data), sz); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sgd"); } } }; @@ -90,6 +85,8 @@ class SGDOpXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL( - sgd, ops::SGDOpXPUKernel); + sgd, ops::SGDOpXPUKernel, + ops::SGDOpXPUKernel); #endif diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index a9646b2e8acb5..cbe58644f5381 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -123,7 +123,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { column, depth, platform::errors::OutOfRange( "Attr(column) should be less than depth(the second " - "dimension of Input(Score)). Recieved Attr(column): %d, while " + "dimension of Input(Score)). 
Received Attr(column): %d, while " "depth is %d.", column, depth)); PADDLE_ENFORCE_GE( @@ -131,7 +131,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { platform::errors::OutOfRange( "Attr(column) should be greater than equal to negative " "depth, i.e. the second dimension of Input(Score). " - "Recieved Attr(column): %d, while negative depth is %d.", + "Received Attr(column): %d, while negative depth is %d.", column, -depth)); } diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 7228bdbf3805a..6a2ed6592e7fe 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -98,7 +98,7 @@ static void VisitDataType(paddle::experimental::DataType type, visitor.template apply(); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The recieved values gate_id type %s can not meet input requirements. " + "The received values gate_id type %s can not meet input requirements. " "Because the given gate_id data type of operators must be " "int64. Please input appropriate gate_id again! ", "framework::DataTypeToString(type)")); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc index 2df0d7526a3d3..457e37744d316 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc @@ -63,7 +63,7 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { PADDLE_ENFORCE_EQ(pieces.size(), 2, platform::errors::PreconditionNotMet( "Invalid format of message_and_id argument. " - "Expected \"message:block_id\". Recieved %s", + "Expected \"message:block_id\". Received %s", grad_and_id.c_str())); PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0, platform::errors::AlreadyExists( @@ -82,7 +82,7 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { PADDLE_ENFORCE_GE(num_blocks, 1, platform::errors::PreconditionNotMet( "Invalid number of blocks in server program. Expected " - "equal or greater than 1. Recieved %zu", + "equal or greater than 1. Received %zu", num_blocks)); std::vector block_list; for (size_t blkid = 1; blkid < num_blocks; ++blkid) { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 4b6759ea165ed..db0f5758d2f53 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -14,6 +14,7 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" @@ -85,10 +86,27 @@ BufferedReader::BufferedReader( stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::XPUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::XpuEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::XpuStreamResourcePool::Instance().New(dev_idx); + } +#endif + cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); mlu_buffer_.resize(buffer_size); + xpu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -322,6 +340,57 @@ void BufferedReader::ReadAsync(size_t i) { platform::MLUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(place_)) { + TensorVec &xpu = xpu_buffer_[i]; + if (xpu.empty()) { + xpu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + xpu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on XPU and CPU devices are not matched. " + "The number on XPU is %d, on CPU is %d", + xpu.size(), cpu.size())); + } + + std::vector xpu_ptrs; + xpu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + xpu[i].Resize(cpu[i].dims()); + xpu[i].set_layout(cpu[i].layout()); + xpu_ptrs.emplace_back(xpu[i].mutable_data(place_, cpu[i].type())); + } + + platform::XPUDeviceGuard gurad(place_.device); + int r = xpu_event_record(events_[i].get(), compute_stream_); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_event_record"); + r = xpu_stream_wait_event(stream_.get(), events_[i].get()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_stream_wait_event"); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto xpu_ptr = xpu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + // TODO(zhanghuan) for now hardware not support xpu_memcpy_async, maybe + // KL3 + if ((platform::is_xpu_place(cpu_place))) { + memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size); + platform::XPUStreamSync(stream_.get()); + } else { + memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size); + } + xpu[i].set_lod(cpu[i].lod()); + } + platform::XPUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -359,6 +428,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(npu_buffer_[i]); } else if (platform::is_mlu_place(place_)) { *out = std::move(mlu_buffer_[i]); + } else if (platform::is_xpu_place(place_)) { + *out = std::move(xpu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index f0f3b6b7f9fdf..52d3d8d6999a0 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -33,6 +33,10 @@ #include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" +#endif namespace paddle { namespace operators { @@ -76,6 +80,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cuda_buffer_; std::vector npu_buffer_; std::vector mlu_buffer_; + std::vector xpu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -94,6 +99,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_XPU + xpuStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index f99b72faba4ae..04660fb501142 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -105,6 +105,68 @@ class ReduceMaxNPUKernel : public framework::OpKernel { } }; +template +class ReduceMaxGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + int in_dtype = context.Attr("in_dtype"); + + PADDLE_ENFORCE_EQ( + in_dtype == -1, true, + platform::errors::InvalidArgument( + "NPU only support in_dtype == -1 in reduce_max_grad op.")); + + auto* x_grad = context.Output(framework::GradVarName("X")); + x_grad->mutable_data(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + auto place = context.GetPlace(); + auto stream = dev_ctx.stream(); + + // broadcast + auto x_dims_vec = phi::vectorize(x->dims()); + Tensor transformed_out(x->type()); + transformed_out.Resize(phi::make_ddim(x_dims_vec)); + transformed_out.mutable_data(place); + NpuOpRunner r_brd_out; + r_brd_out.SetType("BroadcastTo") + .AddInput(*out) + .AddInput(std::move(x_dims_vec)) + .AddOutput(transformed_out) + .Run(stream); + Tensor transformed_out_grad(x->type()); + transformed_out_grad.Resize(phi::make_ddim(x_dims_vec)); + transformed_out_grad.mutable_data(place); + NpuOpRunner r_brd_out_grad; + r_brd_out_grad.SetType("BroadcastTo") + .AddInput(*out_grad) + .AddInput(std::move(x_dims_vec)) + .AddOutput(transformed_out_grad) + .Run(stream); + + // compare + Tensor equal_cond; + equal_cond.mutable_data(x_grad->dims(), place); + const auto& r_equal = + NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {}); + r_equal.Run(stream); + + // select + Tensor t_zero; + t_zero.mutable_data(x_grad->dims(), place); + FillNpuTensorWithConstant(&t_zero, static_cast(0)); + t_zero.Resize(x_grad->dims()); + + const auto& r_sel = NpuOpRunner( + "SelectV2", {equal_cond, transformed_out_grad, t_zero}, {*x_grad}, {}); + r_sel.Run(stream); + } +}; + } // namespace operators } // namespace paddle @@ -115,3 +177,8 @@ REGISTER_OP_NPU_KERNEL( ops::ReduceMaxNPUKernel, ops::ReduceMaxNPUKernel, ops::ReduceMaxNPUKernel); +REGISTER_OP_NPU_KERNEL( + reduce_max_grad, 
ops::ReduceMaxGradNPUKernel, + ops::ReduceMaxGradNPUKernel, + ops::ReduceMaxGradNPUKernel, + ops::ReduceMaxGradNPUKernel); diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index 220d91bf4faab..941e463f63cdc 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -65,7 +65,7 @@ class RnnXPUKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); auto* dropout_mask = ctx.Output("DropoutState"); auto* reserve_data = ctx.Output("Reserve"); - // Attrbutes + // Attributes const int& num_layers = ctx.Attr("num_layers"); const bool& is_bidirec = ctx.Attr("is_bidirec"); const int& hidden_size = ctx.Attr("hidden_size"); diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc index 420c4c5f257ca..e02c7ade9a11a 100644 --- a/paddle/fluid/operators/sample_logits_op.cc +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -58,7 +58,7 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput( "Probabilities", "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." - "The probabilites of sampled positive and negtive labels.") + "The probabilities of sampled positive and negtive labels.") .AsIntermediate(); AddOutput("LogitsDim", "Store dim information of Logits for gradient op") .AsIntermediate(); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index f186f95a2b961..ed173bb3ebfa9 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -22,7 +22,7 @@ using Tensor = framework::Tensor; const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { - // Add this check is is due to Ascend SigmoidCrossEntropyWithLogits + // Add this check is due to Ascend SigmoidCrossEntropyWithLogits // and SigmoidCrossEntropyWithLogitsGrad does't supoort // attr normalize and ignore_index bool normalize = ctx.Attr("normalize"); diff --git a/paddle/fluid/operators/slice_op_mlu.cc b/paddle/fluid/operators/slice_op_mlu.cc new file mode 100644 index 0000000000000..43322e4b2e75b --- /dev/null +++ b/paddle/fluid/operators/slice_op_mlu.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/slice_op.h" + +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/slice_utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SliceMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* out = ctx.Output("Out"); + + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto infer_flags = ctx.Attr>("infer_flags"); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); + } + + PADDLE_ENFORCE_EQ( + starts.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of starts must be equal to the size of axes.")); + PADDLE_ENFORCE_EQ( + ends.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of ends must be equal to the size of axes.")); + + const auto& in_dims = input->dims(); + auto slice_dims = out->dims(); + bool reset_slice_dims = false; + if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") || + starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) { + // Infer output dims + for (size_t i = 0; i < axes.size(); ++i) { + // when start == -1 && end == start+1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = in_dims[axes[i]]; + } + } + } + + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + reset_slice_dims = true; + auto out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis); + + out->Resize(out_dims); + } + if (slice_dims.size() != in_dims.size() && !reset_slice_dims) { + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + } + + int in_dim_size = input->dims().size(); + if (static_cast(axes.size()) != in_dim_size) { + std::vector tmp_starts(in_dim_size, 0); + const auto& in_dims_vec = phi::vectorize(input->dims()); + std::vector tmp_ends(in_dims_vec.begin(), in_dims_vec.end()); + for (size_t i = 0; i < axes.size(); ++i) { + tmp_starts[axes[i]] = starts[i]; + tmp_ends[axes[i]] = ends[i]; + } + starts.swap(tmp_starts); + ends.swap(tmp_ends); + } + std::vector strides(in_dim_size, 1); + + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc out_desc(slice_dims.size(), + phi::vectorize(slice_dims).data(), + ToCnnlDataType()); + MLUCnnl::StridedSlice(ctx, starts.data(), ends.data(), strides.data(), + input_desc.get(), GetBasePtr(input), out_desc.get(), + GetBasePtr(out)); + } +}; + +template +class SliceGradMLUKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dinput = ctx.Output(framework::GradVarName("Input")); + + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); + } + + const auto& in_dims = input->dims(); + auto slice_dims = dout->dims(); + if (slice_dims.size() != in_dims.size()) { + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + } + + int in_dim_size = input->dims().size(); + if (static_cast(axes.size()) != in_dim_size) { + std::vector tmp_starts(in_dim_size, 0); + const auto& in_dims_vec = phi::vectorize(input->dims()); + std::vector tmp_ends(in_dims_vec.begin(), in_dims_vec.end()); + for (size_t i = 0; i < axes.size(); ++i) { + tmp_starts[axes[i]] = starts[i]; + tmp_ends[axes[i]] = ends[i]; + } + starts.swap(tmp_starts); + ends.swap(tmp_ends); + } + std::vector strides(in_dim_size, 1); + + dinput->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc dout_desc(slice_dims.size(), + phi::vectorize(slice_dims).data(), + ToCnnlDataType()); + MLUCnnlTensorDesc dinput_desc(*dinput); + MLUCnnl::StridedSliceGrad(ctx, starts.data(), ends.data(), strides.data(), + dout_desc.get(), GetBasePtr(dout), + dinput_desc.get(), GetBasePtr(dinput)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(slice, ops::SliceMLUKernel, + ops::SliceMLUKernel, ops::SliceMLUKernel, + ops::SliceMLUKernel, + ops::SliceMLUKernel, + ops::SliceMLUKernel); + +REGISTER_OP_MLU_KERNEL(slice_grad, ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 33590c1d7cca0..8c6c083cde880 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -156,7 +156,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } } - // compute select rows seperately. + // compute select rows separately. if (!selectrow_index.empty()) { std::vector sr_in_out_data; size_t rows = 0; @@ -241,7 +241,7 @@ class SumKernel LodTensorArrayCompute(context); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Ouput(out) must be Tensor, SelectedRows or " + "Expected type of Output(out) must be Tensor, SelectedRows or " "LodTensorArray. 
But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index 963dfd3bf7720..e437975320cc5 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -149,7 +149,7 @@ class TDMChildKernel : public framework::OpKernel { output_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(out_type_match, true, platform::errors::InvalidArgument( - "Ouput(Child) & Output(LeafMask) holds the wrong " + "Output(Child) & Output(LeafMask) holds the wrong " "type, it holds %s, but " "desires to be %s or %s", paddle::framework::DataTypeToString(output_type), diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index 5cd9feee82895..1583e5d84b233 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -95,7 +95,7 @@ An operator integrating the open-source https://arxiv.org/pdf/1512.02595v1.pdf), to compute Connectionist Temporal Classification (CTC) loss. It can be aliased as softmax with ctc, since a native softmax activation is -interated to the warp-ctc library, to to normalize values for each row of the +interated to the warp-ctc library, to normalize values for each row of the input tensor. More detail of CTC loss can be found by referring to diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index f29546c5210d9..24d39c25cf335 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -116,7 +116,7 @@ endif() cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) -# seperate init from device_context to avoid cycle dependencies +# separate init from device_context to avoid cycle dependencies cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool) # memcpy depends on device_context, here add deps individually for @@ -125,7 +125,7 @@ cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc x place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator) if(WITH_XPU) - target_link_libraries(device_context xpu_context) + target_link_libraries(device_context xpu_context xpu_resource_pool) endif() cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index eb82389702ca4..6da5d1244fbed 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -50,11 +50,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_uint64(gpu_memory_limit_mb); -#ifdef PADDLE_WITH_TESTING PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false, "Whether to print the message of gpu memory usage " "at exit, mainly used for UT and CI."); -#endif +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true, + "Whether to print the message of gpu memory usage " + "MB as a unit of measurement."); constexpr static float fraction_reserve_gpu_memory = 0.05f; @@ -145,25 +146,32 @@ class RecordedGpuMallocHelper { mtx_.reset(new std::mutex()); } -#ifdef PADDLE_WITH_TESTING if (FLAGS_enable_gpu_memory_usage_log) { // A fake UPDATE 
to trigger the construction of memory stat instances, // make sure that they are destructed after RecordedGpuMallocHelper. MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + MEMORY_STAT_UPDATE(Allocated, dev_id, 0); } -#endif } DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper); public: ~RecordedGpuMallocHelper() { -#ifdef PADDLE_WITH_TESTING if (FLAGS_enable_gpu_memory_usage_log) { - std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : " - << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << std::endl; + if (FLAGS_enable_gpu_memory_usage_log_mb) { + std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = " + << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / 1048576.0 + << ", Allocated = " + << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / 1048576.0 + << std::endl; + } else { + std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = " + << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) + << ", Allocated = " + << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) << std::endl; + } } -#endif } static RecordedGpuMallocHelper *Instance(int dev_id) { diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 4301ef4bcf126..61ea0fd3cd293 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -50,6 +50,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclInt64; } else if (type == framework::proto::VarType::FP16) { return ncclFloat16; + } else if (type == framework::proto::VarType::INT8) { + return ncclInt8; } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 96c2b4f9a9ded..4f15ecf3babf2 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -197,7 +197,9 @@ void Executor::Run(const std::vector &inputs, } VLOG(10) << "Prepared inputs/anchors"; - if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) { + if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched && + !(ipu_strategy_->popart_options.createImplicitPipeliningFwdOnlyProgram && + ipu_strategy_->runtime_options.enable_eval)) { popart::Optimizer *optimizer; if (ipu_strategy_->runtime_options.enable_eval) { VLOG(10) << "Switch optimizer to eval mode"; @@ -215,7 +217,12 @@ void Executor::Run(const std::vector &inputs, popart::StepIO stepio(popart_inputs, popart_anchors); VLOG(10) << "Running..."; - session_->run(stepio); + if (ipu_strategy_->popart_options.createImplicitPipeliningFwdOnlyProgram && + ipu_strategy_->runtime_options.enable_eval) { + session_->run("implicitPipeliningFwdOnly", stepio); + } else { + session_->run(stepio); + } VLOG(10) << "Running...done"; } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index aff5498243000..714f44c69b0d9 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -32,6 +32,20 @@ void RegisterGetter( options_type[name] = type_str; } +struct DefaultCompilationProgressLogger { + void operator()(int progress, int total) { + if (progress != progress_ && progress % log_interval_ == 0) { + progress_ = progress; + VLOG(1) << "Graph compile progress: " << progress << "%"; + } + } + + int log_interval_ = 10; + int progress_ = 0; + // default total progress + int total_ = 
100; +}; + } // namespace namespace paddle { @@ -271,6 +285,8 @@ IpuStrategy::IpuStrategy() { ADD_POPART_BOOL_OPTION_ALIAS( schedule_non_weight_update_gradient_consumers_early, scheduleNonWeightUpdateGradientConsumersEarly); + ADD_POPART_BOOL_OPTION_ALIAS(create_implicit_pipelining_fwd_only_program, + createImplicitPipeliningFwdOnlyProgram); ADD_POPART_DOUBLE_OPTION_ALIAS(outline_sequence_break_cost, outlineSequenceBreakCost); @@ -327,21 +343,26 @@ IpuStrategy::IpuStrategy() { return std::to_string(popart_options.partialsTypeMatMuls == "half"); }); - RegisterSetter( - container_options, "dot_checks", - [&](const std::pair& p) { - std::uint64_t value = std::stoul(p.first); - popart_options.dotChecks.insert(static_cast(value)); - }); + RegisterSetter(container_options, "dot_checks", + [&](const std::pair& p) { + std::vector valid_dot{"Fwd0", "Fwd1", "Bwd0", + "PreAlias", "Final"}; + if (std::find(valid_dot.begin(), valid_dot.end(), p.first) == + valid_dot.end()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown dot check: %s", p.first)); + } + popart_options.dotChecks.insert(p.first); + }); - RegisterGetter( - vector_options_getter, options_type, "dot_checks", "vector", [&]() { - std::vector res; - for (auto x : popart_options.dotChecks) { - res.push_back(std::to_string(static_cast(x))); - } - return res; - }); + RegisterGetter(vector_options_getter, options_type, "dot_checks", "vector", + [&]() { + std::vector res; + for (auto x : popart_options.dotChecks) { + res.push_back(x); + } + return res; + }); RegisterSetter(container_options, "hardware_instrumentations", [&](const std::pair& p) { @@ -417,11 +438,7 @@ IpuStrategy::IpuStrategy() { // Default options // Can also be set as a custom logger in python, like using tqdm - popart_options.compilationProgressLogger = [](int progress, int total) { - if (progress % 10 == 0) { - VLOG(1) << "compile progress: " << progress << "%"; - } - }; + popart_options.compilationProgressLogger = DefaultCompilationProgressLogger(); } void IpuStrategy::AddBoolOption(const std::string& option, bool value) { @@ -506,6 +523,21 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } } +void IpuStrategy::SetReplicatedCollectivesSettings(const std::string& opt, + bool value) { + VLOG(10) << "Set Replica Setting " << opt << " to " << value; + if (opt == "prepare_schedule_for_merging_collectives") { + popart_options.replicatedCollectivesSettings + .prepareScheduleForMergingCollectives = value; + } else if (opt == "merge_all_reduce_collectives") { + popart_options.replicatedCollectivesSettings.mergeAllReduceCollectives = + value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown option ' %s' for replicated collectives settings", opt)); + } +} + void IpuStrategy::SetAccumulateOuterFragmentSettings( const std::uint64_t& schedule, const std::vector& values) { VLOG(10) << "SetAccumulateOuterFragmentSettings schedule:" << schedule; diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index fa57dcd676d81..da08c76fb90d1 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -118,6 +118,7 @@ class IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void SetReplicatedCollectivesSettings(const std::string &opt, bool value); void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule, const std::vector &values); void 
AddCustomOp(const std::string &paddle_op, const std::string &popart_op, diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index 2409c14b760fd..739a3ef41e422 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -70,7 +70,7 @@ class NpuOpRunner { NpuOpRunner &AddInput(const Tensor &tensor); // NOTE(zhiqiu): CANN-5.0.2 support input tensors on host. - // Specifically, the tensor of shape, tensor of dims, etc, which are are small + // Specifically, the tensor of shape, tensor of dims, etc, which are small // vector/list. NpuOpRunner &AddInput(const Tensor &tensor, aclMemType mem_type); diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index b6a26f2554a13..3399fff087f8d 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -7,5 +7,6 @@ set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place phi_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context op_kernel_type) +cc_library(xpu_resource_pool SRCS xpu_resource_pool.cc DEPS xpu_info) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 77019a0192312..99f8e5ace9c00 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -307,11 +307,13 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"reshape2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), @@ -326,6 +328,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace())})}, {"scatter", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits", diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index 2e960c1c0dd9c..cdd7ee7f806e9 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at @@ -79,6 +79,10 @@ void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place, *dev_ctx); } +void XPUStreamSync(xpuStream stream) { + PADDLE_ENFORCE_XDNN_SUCCESS(xpu_wait(stream), "xpu_wait"); +} + /**************************** Others **************************/ phi::backends::xpu::XPUVersion get_xpu_version(int dev_id) { diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 33385f8e45937..38b4defadc6c3 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -14,8 +14,13 @@ limitations under the License. */ #include #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" +#include "xpu/runtime.h" namespace paddle { + +using xpuStream = XPUStream; +using xpuEventHandle = XPUEvent; + namespace platform { /***** Version Management *****/ @@ -51,6 +56,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place, const void *src, const platform::XPUPlace &src_place, size_t count); +//! Blocks until stream has completed all operations. +void XPUStreamSync(xpuStream stream); + using XPUDeviceGuard = phi::backends::xpu::XPUDeviceGuard; phi::backends::xpu::XPUVersion get_xpu_version(int dev_id); diff --git a/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc b/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc new file mode 100644 index 0000000000000..af0d47c716717 --- /dev/null +++ b/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
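XPUStreamSync added above is a thin wrapper: it calls the runtime's xpu_wait on a stream and converts a non-zero status into a Paddle error through PADDLE_ENFORCE_XDNN_SUCCESS. A self-contained sketch of that check-and-throw idiom for any C-style API that reports errors via return codes (FakeStreamWait and CheckRuntime are placeholders, not XPU runtime calls):

#include <stdexcept>
#include <string>

// Placeholder for a C runtime call that returns 0 on success.
int FakeStreamWait(void* stream) { return stream ? 0 : -1; }

void CheckRuntime(int status, const std::string& what) {
  if (status != 0) {
    throw std::runtime_error(what + " failed with status " +
                             std::to_string(status));
  }
}

void StreamSync(void* stream) {
  // In the real API this blocks until the stream has drained, then reports status.
  CheckRuntime(FakeStreamWait(stream), "stream_wait");
}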
+ +#if defined(PADDLE_WITH_XPU) +#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" + +namespace paddle { +namespace platform { + +XpuStreamResourcePool::XpuStreamResourcePool() { + int dev_cnt = platform::GetXPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::XPUDeviceGuard gurad(dev_idx); + xpuStream stream; + xpu_stream_create(&stream); + return stream; + }; + + auto deleter = [dev_idx](xpuStream stream) { + platform::XPUDeviceGuard gurad(dev_idx); + xpu_stream_destroy(stream); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +XpuStreamResourcePool& XpuStreamResourcePool::Instance() { + static XpuStreamResourcePool pool; + return pool; +} + +std::shared_ptr XpuStreamResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +XpuEventResourcePool::XpuEventResourcePool() { + int dev_cnt = platform::GetXPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::XPUDeviceGuard gurad(dev_idx); + xpuEventHandle event; + xpu_event_create(&event); + return event; + }; + + auto deleter = [dev_idx](xpuEventHandle event) { + platform::XPUDeviceGuard gurad(dev_idx); + xpu_event_destroy(event); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +XpuEventResourcePool& XpuEventResourcePool::Instance() { + static XpuEventResourcePool pool; + return pool; +} + +std::shared_ptr XpuEventResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu_resource_pool.h b/paddle/fluid/platform/device/xpu/xpu_resource_pool.h new file mode 100644 index 0000000000000..5c6ade8f6f88f --- /dev/null +++ b/paddle/fluid/platform/device/xpu/xpu_resource_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
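The new xpu_resource_pool.cc above builds one pool per visible device and hands each pool a creator/deleter lambda pair that switches to the owning device before creating or destroying a stream or event. A condensed, standard-library-only sketch of that shape (Handle, DeviceGuard and DevicePool are stand-ins; the real code defers to ResourcePool::Create and the XPU runtime):

#include <functional>
#include <memory>
#include <vector>

using Handle = std::shared_ptr<int>;  // stand-in for a pooled stream/event handle

struct DeviceGuard {                  // stand-in for XPUDeviceGuard
  explicit DeviceGuard(int) {}
};

class DevicePool {
 public:
  explicit DevicePool(int dev_cnt) {
    creators_.reserve(dev_cnt);
    for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
      creators_.push_back([dev_idx] {
        DeviceGuard guard(dev_idx);            // create on the owning device
        return Handle(new int(dev_idx), [dev_idx](int* h) {
          DeviceGuard guard(dev_idx);          // destroy on the same device
          delete h;
        });
      });
    }
  }
  Handle New(int dev_idx) { return creators_.at(dev_idx)(); }

 private:
  std::vector<std::function<Handle()>> creators_;
};

The real pool additionally recycles returned handles instead of destroying them right away, and exposes itself through the process-wide Instance() singleton declared in the non-copyable classes of the header that follows.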
+ +#pragma once + +#if defined(PADDLE_WITH_XPU) +#include +#include +#include + +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/resource_pool.h" + +namespace paddle { +namespace platform { + +using XpuStreamObject = std::remove_pointer::type; +using XpuEventObject = std::remove_pointer::type; + +class XpuStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static XpuStreamResourcePool &Instance(); + + private: + XpuStreamResourcePool(); + + DISABLE_COPY_AND_ASSIGN(XpuStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class XpuEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static XpuEventResourcePool &Instance(); + + private: + XpuEventResourcePool(); + + DISABLE_COPY_AND_ASSIGN(XpuEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2c5f24d28c6d6..2b53ecf86a641 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Copyright (c) 2022 NVIDIA Corporation. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -188,6 +188,7 @@ class XPUDeviceContext : public phi::XPUContext { explicit XPUDeviceContext(XPUPlace place); virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } + xpuStream stream() const { return XPUContext::x_context()->xpu_stream; } }; template <> diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc index 6c8be1811d715..b909fb5f25aa7 100644 --- a/paddle/fluid/platform/profiler/event_node.cc +++ b/paddle/fluid/platform/profiler/event_node.cc @@ -51,7 +51,7 @@ void NodeTrees::BuildTrees( const std::vector& host_event_nodes, std::vector& runtime_event_nodes, const std::vector& device_event_nodes) { - // seperate Host Event Nodes into different threads + // separate Host Event Nodes into different threads std::map> thread2host_event_nodes; // used to store HostTraceEventNodes per thread std::map> diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc index 1a6a395545a96..aef02d65b4dbd 100644 --- a/paddle/fluid/pybind/communication.cc +++ b/paddle/fluid/pybind/communication.cc @@ -58,13 +58,16 @@ void BindTCPStore(py::module *m) { py::class_>(*m, "TCPStore", Store) .def(py::init([](std::string hostname, uint16_t port, bool is_master, - size_t world_size, std::chrono::seconds timeout) { + size_t world_size, std::chrono::seconds timeout, + int stop_check_timeout) { return std::make_shared(hostname, port, is_master, - world_size, timeout); + world_size, timeout, + stop_check_timeout); }), py::arg("hostname"), py::arg("port"), py::arg("is_master"), py::arg("world_size"), py::arg("timeout") = distributed::tcputils::kNoTimeout, + py::arg("stop_check_timeout") = 900, py::call_guard()); } diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index ac33eb2359c8c..5395b4f31c83b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -207,7 +207,8 @@ static void ConstructFwdAndBwdMap( auto grad_attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]); std::vector> res(5); - in_out_map.insert({op_type, 
res}); + + in_out_map.insert({op_type, {res}}); // Prepare pos map for grad_outputs VLOG(7) << "Prepare pos map for grad_outputs"; PADDLE_ENFORCE_LE( @@ -227,7 +228,7 @@ static void ConstructFwdAndBwdMap( VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j << " inputs: " << inputs_names[j] << " related to No." << i << " grad_outputs: " << grad_outputs_names[i]; - in_out_map[op_type][0][j] = i; + in_out_map[op_type][0][0][j] = i; } } } @@ -240,7 +241,7 @@ static void ConstructFwdAndBwdMap( VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j << " outputs: " << outputs_names[j] << " related to No." << i << " grad_inputs's grad: " << grad_inputs_names[i]; - in_out_map[op_type][1][j] = i; + in_out_map[op_type][0][1][j] = i; } } } else { @@ -252,7 +253,7 @@ static void ConstructFwdAndBwdMap( << " outputs: " << outputs_names[j] << " related to No." << i << " grad_inputs fwd outputs: " << grad_inputs_names[i]; - in_out_map[op_type][2][j] = i; + in_out_map[op_type][0][2][j] = i; } } } else { @@ -262,7 +263,7 @@ static void ConstructFwdAndBwdMap( << " inputs: " << inputs_names[j] << " related to No." << i << " grad_inputs fwd inputs: " << grad_inputs_names[i]; - in_out_map[op_type][3][j] = i; + in_out_map[op_type][0][3][j] = i; } } } @@ -284,7 +285,7 @@ static void ConstructFwdAndBwdMap( VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j << " attrs: " << attrs_names[j] << " related to No." << i << " grad_attrs: " << grad_attrs_names[i]; - in_out_map[op_type][4][j] = i; + in_out_map[op_type][0][4][j] = i; } } } @@ -402,8 +403,8 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, ctx.InputsBetween(ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second); - if (slot_map[0].find(i) != slot_map[0].end()) { - grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]); + if (slot_map[0][0].find(i) != slot_map[0][0].end()) { + grad_node->SetGradOutMeta(in_tensors, slot_map[0][0][i]); } else { grad_node->SetGradOutMeta(in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); @@ -423,7 +424,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } // Prepare Grad inputs with fwd outputs - for (auto it = slot_map[2].begin(); it != slot_map[2].end(); it++) { + for (auto it = slot_map[0][2].begin(); it != slot_map[0][2].end(); it++) { VLOG(7) << "Prepare fwd_outs: " << it->first << " to grad_inputs: " << it->second; grad_node->fwd_outs[it->second] = @@ -433,7 +434,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } // Prepare Grad inputs with fwd inputs - for (auto it = slot_map[3].begin(); it != slot_map[3].end(); it++) { + for (auto it = slot_map[0][3].begin(); it != slot_map[0][3].end(); it++) { VLOG(7) << "Prepare fwd_ins: " << it->first << " to grad_inputs: " << it->second; grad_node->fwd_ins[it->second] = @@ -446,7 +447,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, meta_info_map.at(op_type)[1]); std::vector attrs(attrs_names.size()); // Prepare attrs for Grad node - for (auto it = slot_map[4].begin(); it != slot_map[4].end(); it++) { + for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) { VLOG(7) << "Prepare fwd attrs: " << it->first << " to grad_attrs: " << it->second; attrs[it->second] = res_attrs[it->first]; diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index d3393b7cb57ac..4d7b80a4e8c96 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -18,6 +18,7 @@ typedef 
SSIZE_T ssize_t; #include #include +#include #include #include "pybind11/numpy.h" @@ -361,12 +362,33 @@ static PyObject* tensor_method__is_dense_tensor_hold_allocation( EAGER_CATCH_AND_THROW_RETURN_NULL } +static void IncreaseTensorReferenceCountUntilCopyComplete( + const paddle::experimental::Tensor& tensor, const platform::Place& place) { + auto place_ = platform::is_gpu_place(place) ? place : tensor.place(); + + auto tracer = egr::Controller::Instance().GetCurrentTracer(); + auto gc = tracer->MutableGarbageCollectorIfNotExists(place_); + + // Note(dev): This is an empty callback, the only way is to "reference" + // inner memory Holder, so it will not be destructed until the kernels + // launched at current stream of given place is finished, such as + // CUDAPinned Mem -> CUDA by cudamemcpyAsync. + auto callback = [tensor, place_]() { + VLOG(3) << "Run callback of Tensor:" << tensor.name() << " at place " + << place_; + }; + gc->DirectClearCallback(callback); +} + static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = self->tensor.copy_to(place, blocking); + if (!blocking) { + IncreaseTensorReferenceCountUntilCopyComplete(self->tensor, place); + } egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); egr::EagerUtils::autograd_meta(&cp_tensor) ->SetPersistable( @@ -654,7 +676,9 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, PyObject* kwargs) { EAGER_TRY if (!self->tensor.defined()) { - RETURN_PY_NONE + // The original `get_tensor` method of Variable will create a empty tensor + phi::DenseTensor empty_tensor; + return ToPyObject(&empty_tensor); } if (self->tensor.is_dense_tensor()) { auto* tensor = @@ -1254,6 +1278,47 @@ static PyObject* tensor__copy_gradient_from(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } + +static PyObject* tensor_method_set_vocab(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + using Vocab = std::unordered_map; + auto vocab = CastPyArg2Vocab(PyTuple_GET_ITEM(args, 0), 0); + auto var_tensor = std::make_shared(); + *var_tensor->GetMutable() = vocab; + self->tensor.set_impl(var_tensor); + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_set_string_list(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + using Strings = std::vector; + auto strings = CastPyArg2Strings(PyTuple_GET_ITEM(args, 0), 0); + auto var_tensor = std::make_shared(); + *var_tensor->GetMutable() = strings; + self->tensor.set_impl(var_tensor); + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_map_tensor(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE_EQ( + egr::IsVariableCompatTensor(self->tensor), true, + paddle::platform::errors::Fatal( + "this method is only effective for VariableCompatTensor")); + using Vocab = std::unordered_map; + auto* var_tensor = + static_cast(self->tensor.impl().get()); + return ToPyObject(var_tensor->Get()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -1634,6 +1699,15 @@ PyMethodDef variable_methods[] = { {"_copy_gradient_from", (PyCFunction)(void (*)(void))tensor__copy_gradient_from, METH_VARARGS | METH_KEYWORDS, NULL}, + /** the methods to 
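IncreaseTensorReferenceCountUntilCopyComplete above registers a callback whose only job is to capture the tensor by value; the garbage collector runs the callback (and drops the captured reference) only after the work queued on the destination place has finished, so the source allocation outlives the asynchronous _copy_to. A self-contained sketch of that lifetime trick using std::shared_ptr and a toy deferred-callback queue (GarbageCollector is replaced here by a plain vector; none of these names are Paddle's):

#include <functional>
#include <iostream>
#include <memory>
#include <vector>

struct Buffer {
  ~Buffer() { std::cout << "buffer released\n"; }
};

// Toy stand-in: callbacks run only once the "device work" is known to be done.
std::vector<std::function<void()>> pending_callbacks;

void KeepAliveUntilCopyComplete(std::shared_ptr<Buffer> src) {
  // The lambda is intentionally empty; it exists only to hold a reference.
  pending_callbacks.push_back([src] {});
}

int main() {
  auto src = std::make_shared<Buffer>();
  KeepAliveUntilCopyComplete(src);
  src.reset();                        // caller drops its reference early
  std::cout << "async copy running\n";
  pending_callbacks.clear();          // "copy done": last reference released here
}

Running this prints "async copy running" before "buffer released", showing the buffer survives until the callback queue is drained.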
adapt old dygraph, will be removed in the future **/ + {"set_string_list", + (PyCFunction)(void (*)(void))tensor_method_set_string_list, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"set_vocab", (PyCFunction)(void (*)(void))tensor_method_set_vocab, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"get_map_tensor", + (PyCFunction)(void (*)(void))tensor_method_get_map_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, /***the method of sparse tensor****/ {"indices", (PyCFunction)(void (*)(void))tensor_method_get_non_zero_indices, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 0473c29a3342b..590ecfbad4be5 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -58,6 +58,10 @@ PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR); } else if (self->tensor.is_selected_rows()) { return ToPyObject(paddle::framework::proto::VarType::SELECTED_ROWS); + } else if (egr::IsVariableCompatTensor(self->tensor)) { + return ToPyObject(static_cast( + static_cast(self->tensor.impl().get()) + ->Type())); } else { RETURN_PY_NONE } @@ -152,11 +156,27 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { if (!self->tensor.defined()) { return ToPyObject(value); } - auto ddim = self->tensor.shape(); - size_t rank = static_cast(ddim.size()); - value.resize(rank); - for (size_t i = 0; i < rank; i++) { - value[i] = ddim[i]; + if (egr::IsVariableCompatTensor(self->tensor)) { + auto* var_tensor = static_cast( + self->tensor.impl().get()); + if (var_tensor->IsType()) { + value.emplace_back(static_cast( + var_tensor->Get().size())); + } else if (var_tensor->IsType()) { + value.emplace_back(static_cast( + var_tensor->Get().size())); + } else { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor only support get shape from Vocab or " + "Strings.")); + } + } else { + auto ddim = self->tensor.shape(); + size_t rank = static_cast(ddim.size()); + value.resize(rank); + for (size_t i = 0; i < rank; i++) { + value[i] = ddim[i]; + } } return ToPyObject(value); @@ -183,8 +203,22 @@ PyObject* tensor_properties_get_dtype(TensorObject* self, void* closure) { // be same to old dygraph return ToPyObject(framework::proto::VarType::FP32); } - return ToPyObject( - paddle::framework::TransToProtoVarType(self->tensor.type())); + if (egr::IsVariableCompatTensor(self->tensor)) { + auto* var_tensor = static_cast( + self->tensor.impl().get()); + if (var_tensor->IsType()) { + return ToPyObject(framework::proto::VarType::RAW); + } else if (var_tensor->IsType()) { + return ToPyObject(framework::proto::VarType::STRING); + } else { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor only support get shape from Vocab or " + "Strings.")); + } + } else { + return ToPyObject( + paddle::framework::TransToProtoVarType(self->tensor.type())); + } EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 90d7024f7a746..4707f757d8bfb 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -472,6 +472,28 @@ paddle::framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, return dtype; } +std::unordered_map CastPyArg2Vocab(PyObject* obj, + ssize_t arg_pos) { + if (PyDict_Check(obj)) { + return ::pybind11::handle(obj) + .cast>(); + } else { + 
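The eager_properties hunks above teach shape and dtype about VariableCompatTensor: when the variable holds a Vocab (wstring to int map) or a Strings list, "shape" degenerates to the container's element count and dtype maps to RAW or STRING. A rough sketch of that type-dependent dispatch using std::variant (the Vocab/Strings aliases mirror the hunk; the variant itself is only an illustration of the dispatch, not how VariableCompatTensor is stored):

#include <cstdint>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

using Vocab = std::unordered_map<std::wstring, std::int32_t>;
using Strings = std::vector<std::string>;
using CompatValue = std::variant<Vocab, Strings>;

std::vector<std::int64_t> Shape(const CompatValue& v) {
  // Whatever the held type, "shape" is just its element count.
  return std::visit(
      [](const auto& held) {
        return std::vector<std::int64_t>{
            static_cast<std::int64_t>(held.size())};
      },
      v);
}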
PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be dict, but got %s", arg_pos + 1, + reinterpret_cast(obj->ob_type)->tp_name)); + } +} + +std::vector CastPyArg2Strings(PyObject* obj, ssize_t arg_pos) { + if (PyList_Check(obj)) { + return ::pybind11::handle(obj).cast>(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be list, but got %s", arg_pos + 1, + reinterpret_cast(obj->ob_type)->tp_name)); + } +} + paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, ssize_t arg_pos) { if (PyObject_IsInstance( @@ -719,6 +741,28 @@ PyObject* ToPyObject( return dict; } +PyObject* ToPyObject(const std::unordered_map& value) { + PyObject* dict = PyDict_New(); + for (const auto map_iter : value) { + // Convert Key + PyObject* key_string = + PyUnicode_FromWideChar(map_iter.first.c_str(), map_iter.first.size()); + if (!key_string) { + PADDLE_THROW(platform::errors::Fatal( + "Unable to convert std::wstring to PyObject")); + } + + // Convert Val + PyObject* py_int = PyLong_FromLong(map_iter.second); + + if (PyDict_SetItem(dict, key_string, py_int) != 0) { + PADDLE_THROW( + platform::errors::Fatal("Unable to set key:value for py_dict")); + } + } + return dict; +} + // For Final State Dygraph, // We directly use paddle::optional(Tensor) as dispensable Tensor paddle::optional GetOptionalTensorFromArgs( @@ -1045,7 +1089,7 @@ paddle::experimental::Scalar CastNumpy2Scalar(PyObject* obj, } else if (type_name == "numpy.int64") { int64_t value = CastPyArg2Long(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); - } else if (type_name == "numpy.int32") { + } else if (type_name == "numpy.int32" || type_name == "numpy.intc") { int value = CastPyArg2Int(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); } else { diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 5273433208d11..c8e1cd4ad0b75 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -65,6 +65,9 @@ std::vector> CastPyArg2VectorOfVectorOfSize_t( PyObject* obj, size_t arg_pos); framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); +std::unordered_map CastPyArg2Vocab(PyObject* obj, + ssize_t arg_pos); +std::vector CastPyArg2Strings(PyObject* obj, ssize_t arg_pos); PyObject* ToPyObject(int value); PyObject* ToPyObject(uint32_t value); @@ -96,6 +99,7 @@ PyObject* ToPyObject(const paddle::framework::proto::VarType& type); PyObject* ToPyObject(const void* value); PyObject* ToPyObject( const std::unordered_map>& value); +PyObject* ToPyObject(const std::unordered_map& value); template struct TupleTensorResult { diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index bcf55e46edb76..2549240aa15da 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -327,16 +327,15 @@ void BindNeighborSampleResult(py::module* m) { .def("initialize", &NeighborSampleResult::initialize) .def("get_len", &NeighborSampleResult::get_len) .def("get_val", &NeighborSampleResult::get_actual_val) + .def("get_sampled_graph", &NeighborSampleResult::get_sampled_graph) .def("display", &NeighborSampleResult::display); } void BindGraphGpuWrapper(py::module* m) { - py::class_(*m, "GraphGpuWrapper") - // nit<>()) - //.def("test", &GraphGpuWrapper::test) - //.def(py::init([]() { return framework::GraphGpuWrapper::GetInstance(); - //})) - .def(py::init<>()) + py::class_>( + *m, "GraphGpuWrapper") + .def(py::init([]() { 
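The new ToPyObject overload above builds a Python dict by hand: each wide-string key goes through PyUnicode_FromWideChar, each value through PyLong_FromLong, and PyDict_SetItem stitches them together. A compressed sketch of the same CPython sequence, assuming the CPython headers and an initialized interpreter; error handling is reduced to early returns, and the sketch releases its temporaries after insertion since PyDict_SetItem does not steal references:

#include <Python.h>
#include <string>
#include <unordered_map>

PyObject* MapToDict(const std::unordered_map<std::wstring, int>& value) {
  PyObject* dict = PyDict_New();
  if (!dict) return nullptr;
  for (const auto& kv : value) {
    PyObject* key = PyUnicode_FromWideChar(
        kv.first.c_str(), static_cast<Py_ssize_t>(kv.first.size()));
    PyObject* val = PyLong_FromLong(kv.second);
    if (!key || !val || PyDict_SetItem(dict, key, val) != 0) {
      Py_XDECREF(key);
      Py_XDECREF(val);
      Py_DECREF(dict);
      return nullptr;
    }
    // PyDict_SetItem keeps its own references; drop the temporaries here.
    Py_DECREF(key);
    Py_DECREF(val);
  }
  return dict;
}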
return GraphGpuWrapper::GetInstance(); })) + // .def(py::init<>()) .def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3) .def("graph_neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample) .def("set_device", &GraphGpuWrapper::set_device) @@ -347,6 +346,8 @@ void BindGraphGpuWrapper(py::module* m) { .def("load_edge_file", &GraphGpuWrapper::load_edge_file) .def("upload_batch", &GraphGpuWrapper::upload_batch) .def("get_all_id", &GraphGpuWrapper::get_all_id) + .def("init_sample_status", &GraphGpuWrapper::init_sample_status) + .def("free_sample_status", &GraphGpuWrapper::free_sample_status) .def("load_next_partition", &GraphGpuWrapper::load_next_partition) .def("make_partitions", &GraphGpuWrapper::make_partitions) .def("make_complementary_graph", diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 1bbe6808b2846..944781484076b 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -601,6 +601,14 @@ void BindAnalysisConfig(py::module *m) { .def("set_xpu_device_id", &AnalysisConfig::SetXpuDeviceId, py::arg("device_id") = 0) .def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0) + .def("enable_ipu", &AnalysisConfig::EnableIpu, + py::arg("ipu_device_num") = 1, py::arg("ipu_micro_batch_size") = 1, + py::arg("ipu_enable_pipelining") = false, + py::arg("ipu_batches_per_step") = 1) + .def("set_ipu_config", &AnalysisConfig::SetIpuConfig, + py::arg("ipu_enable_fp16") = false, py::arg("ipu_replica_num") = 1, + py::arg("ipu_available_memory_proportion") = 1.0, + py::arg("ipu_enable_half_partial") = false) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("enable_onnxruntime", &AnalysisConfig::EnableONNXRuntime) .def("disable_onnxruntime", &AnalysisConfig::DisableONNXRuntime) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 602a0345b04fe..f6be9b66d5dbd 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -166,6 +166,10 @@ limitations under the License. */ #include "paddle/fluid/pybind/fleet_py.h" #endif +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/pybind/eager_utils.h" @@ -1930,16 +1934,18 @@ All parameter, weight, gradient are variables in Paddle. which contains the id pair of pruned block and corresponding origin block. )DOC"); - m.def("get_readable_comile_key", [](const OpDesc &op_desc) { - auto compilation_key = - BOOST_GET_CONST(std::string, op_desc.GetAttr("compilation_key")); - VLOG(4) << std::hash{}(compilation_key) << " " - << compilation_key.size(); - proto::ProgramDesc desc; - desc.ParseFromString(compilation_key); - auto s = desc.DebugString(); + m.def("get_serialize_comile_key", [](int64_t compilation_key) { +#ifdef PADDLE_WITH_CINN + auto compiler = framework::paddle2cinn::CinnCompiler::GetInstance(); + auto s = compiler->SerializeKey(compilation_key); VLOG(4) << s; return s; +#else + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot get compilation key in non-CINN version, " + "Please recompile or reinstall Paddle with CINN support.")); +#endif }); m.def("empty_var_name", []() { return std::string(framework::kEmptyVarName); }); @@ -4394,6 +4400,12 @@ All parameter, weight, gradient are variables in Paddle. 
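The GraphGpuWrapper binding above switches to a std::shared_ptr holder and routes py::init through GetInstance(), so constructing the class from Python always hands back the same underlying singleton. A minimal pybind11 sketch of that pattern, assuming a singleton whose accessor returns a non-owning shared_ptr (Service, ping and service_ext are placeholder names, not Paddle's):

#include <pybind11/pybind11.h>
#include <memory>

namespace py = pybind11;

class Service {
 public:
  static std::shared_ptr<Service> GetInstance() {
    static Service instance;
    // Non-owning shared_ptr: Python never deletes the static instance.
    return std::shared_ptr<Service>(&instance, [](Service*) {});
  }
  int ping() const { return 42; }

 private:
  Service() = default;
};

PYBIND11_MODULE(service_ext, m) {
  py::class_<Service, std::shared_ptr<Service>>(m, "Service")
      .def(py::init([]() { return Service::GetInstance(); }))
      .def("ping", &Service::ping);
}

From Python, Service() then returns a handle to the same object every call, which is the behaviour the commented-out default constructor binding above would not have guaranteed.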
option_name, option.first.cast(), option.second.cast()); } + } else if (option_name == "replicated_collectives_settings") { + for (auto option : element.second.cast()) { + self.SetReplicatedCollectivesSettings( + option.first.cast(), + option.second.cast()); + } } else if (option_name == "accumulate_outer_fragment") { for (auto option : element.second.cast()) { std::vector values; diff --git a/paddle/infrt/common/object.h b/paddle/infrt/common/object.h index ab2d00cce985c..797595cc7c58b 100644 --- a/paddle/infrt/common/object.h +++ b/paddle/infrt/common/object.h @@ -25,7 +25,7 @@ template class Shared; /** * Object is the basic element in the INFRT, with `Shared` wrapper, the object - * can be shared accross the system. + * can be shared across the system. */ struct Object { //! Get the type representation of this object. diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 0595ea4d8bddf..58ad42ddd1ff8 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -23,7 +23,7 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar api_int_array) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index d88f937a8b875..7d28e3d27c496 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -169,6 +169,7 @@ add_custom_command( COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file} ${new_api_yaml_file} --api_header_path ${api_header_file_tmp} + --api_header_path ${api_header_file_tmp} --api_source_path ${api_source_file_tmp} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp} ${api_header_file} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp} ${api_source_file} @@ -270,3 +271,4 @@ cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi k cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api) cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy) +cc_library(api_int_array SRCS int_array.cc DEPS tensor_copy) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 38a60ab978900..d80444e7f710c 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -592,19 +592,20 @@ Tensor conv2d_impl(const Tensor& input, return api_output; } -std::vector> conv2d_grad_impl( - const Tensor& input, - const Tensor& filter, - const Tensor& out_grad, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search) { +void conv2d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const 
std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad) { Backend kernel_backend = Backend::UNDEFINED; DataLayout kernel_layout = DataLayout::UNDEFINED; DataType kernel_data_type = DataType::UNDEFINED; @@ -646,18 +647,15 @@ std::vector> conv2d_grad_impl( auto input_filter = PrepareData(filter, args1, {}); auto input_out_grad = PrepareData(out_grad, args2, {}); - std::vector> api_output(2); - api_output[0].emplace_back(); - auto kernel_out_0 = SetKernelOutput(kernel_backend, &api_output[0][0]); - api_output[1].emplace_back(); - auto kernel_out_1 = SetKernelOutput(kernel_backend, &api_output[1][0]); + auto kernel_out_0 = SetKernelOutput(kernel_backend, input_grad); + auto kernel_out_1 = SetKernelOutput(kernel_backend, filter_grad); phi::MetaTensor meta_out_0(kernel_out_0); phi::MetaTensor meta_out_1(kernel_out_1); phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_input), MakeMetaTensor(*input_filter), - &meta_out_0, - &meta_out_1); + kernel_out_0 ? &meta_out_0 : nullptr, + kernel_out_1 ? &meta_out_1 : nullptr); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, @@ -693,8 +691,6 @@ std::vector> conv2d_grad_impl( kernel_out_0, kernel_out_1); } - - return api_output; } Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { @@ -1080,8 +1076,9 @@ std::tuple sgd_impl( // but if we use this impl, it will not support. We need to be able to reuse // the autograd API here, which is not yet implemented // TODO(chenweihang): we should support call generated api in custom api impl -std::vector add_n_grad_impl(const std::vector& x, - const Tensor& out_grad) { +void add_n_grad_impl(const std::vector& x, + const Tensor& out_grad, + std::vector x_grad) { auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); @@ -1099,9 +1096,7 @@ std::vector add_n_grad_impl(const std::vector& x, auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(0), {}); - size_t out_number = x.size(); - std::vector x_grad; - auto dense_x_grad = SetKernelOutput(out_number, kernel_backend, &x_grad); + auto dense_x_grad = SetKernelOutput(&x_grad); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, @@ -1117,8 +1112,6 @@ std::vector add_n_grad_impl(const std::vector& x, (*kernel_fn)( *dev_ctx, *dense_out_grad, phi::Scalar(1.0), 0.0, true, dense_x_grad_t); } - - return x_grad; } std::tuple batch_norm_impl( @@ -1250,7 +1243,7 @@ std::tuple batch_norm_impl( return api_output; } -Tensor imag_grad_impl(const Tensor& out_grad) { +void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), phi::dtype::ToComplex(out_grad.dtype())}; @@ -1264,8 +1257,7 @@ Tensor imag_grad_impl(const Tensor& out_grad) { auto dense_out_grad = TensorToDenseTensor(out_grad); - Tensor out; - auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), x_grad); phi::MetaTensor meta_out(kernel_out); phi::RealAndImagGradInferMeta(*dense_out_grad, &meta_out); @@ -1274,11 +1266,9 @@ Tensor imag_grad_impl(const Tensor& out_grad) { auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)(*dev_ctx, *dense_out_grad, kernel_out); - - return out; } -Tensor real_grad_impl(const Tensor& out_grad) { +void 
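conv2d_grad_impl and the other grad impls in these hunks stop returning freshly allocated vectors of tensors and instead write into caller-provided Tensor* slots; passing nullptr lets the caller skip a gradient entirely, which is why the InferMeta call now guards each output with kernel_out ? &meta : nullptr. A toy version of that calling convention with plain doubles instead of tensors (MulGrad is purely illustrative):

#include <cstdio>

// Writes dx and/or dw only when the caller asked for them.
void MulGrad(double x, double w, double dout, double* dx, double* dw) {
  if (dx) *dx = dout * w;   // d(x*w)/dx
  if (dw) *dw = dout * x;   // d(x*w)/dw
}

int main() {
  double dx = 0.0;
  MulGrad(/*x=*/2.0, /*w=*/3.0, /*dout=*/1.0, &dx, /*dw=*/nullptr);
  std::printf("dx=%g\n", dx);  // dx=3; the dw computation was skipped entirely
}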
real_grad_impl(const Tensor& out_grad, Tensor* x_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), phi::dtype::ToComplex(out_grad.dtype())}; @@ -1292,8 +1282,7 @@ Tensor real_grad_impl(const Tensor& out_grad) { auto dense_out_grad = TensorToDenseTensor(out_grad); - Tensor out; - auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), x_grad); phi::MetaTensor meta_out(kernel_out); phi::RealAndImagGradInferMeta(*dense_out_grad, &meta_out); @@ -1302,8 +1291,6 @@ Tensor real_grad_impl(const Tensor& out_grad) { auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)(*dev_ctx, *dense_out_grad, kernel_out); - - return out; } } // namespace experimental diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 46abcd90de32a..d88a134654caf 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -96,20 +96,6 @@ Tensor conv2d_impl(const Tensor& input, int workspace_size_MB, bool exhaustive_search); -std::vector> conv2d_grad_impl( - const Tensor& input, - const Tensor& filter, - const Tensor& out_grad, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search); - Tensor copy_to_impl(const Tensor& x, Place place, bool blocking); std::vector split_impl(const Tensor& x, @@ -138,12 +124,28 @@ std::tuple sgd_impl( ////////////////// Backward(grad) api impls ////////////////////// -std::vector add_n_grad_impl(const std::vector& x, - const Tensor& out_grad); - -Tensor imag_grad_impl(const Tensor& x); - -Tensor real_grad_impl(const Tensor& x); +void add_n_grad_impl(const std::vector& x, + const Tensor& out_grad, + std::vector x_grad); + +void conv2d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad); + +void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad); + +void real_grad_impl(const Tensor& out_grad, Tensor* x_grad); } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index fb205212ff371..2111829b8d60b 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -113,10 +113,13 @@ phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) { /* ------------------ for output ----------------------- */ phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { - if (out->impl() == nullptr) { - out->set_impl(std::make_shared()); + if (out) { + if (out->impl() == nullptr) { + out->set_impl(std::make_shared()); + } + return static_cast(out->impl().get()); } - return static_cast(out->impl().get()); + return nullptr; } std::vector SetKernelOutput(size_t out_size, @@ -133,6 +136,18 @@ std::vector SetKernelOutput(size_t out_size, return results; } +std::vector SetKernelOutput(std::vector* out) { + std::vector results(out->size(), nullptr); + for (size_t i = 0; i < out->size(); ++i) { + if (out->at(i)) { + auto tensor_ptr = std::make_shared(); + results[i] = tensor_ptr.get(); + 
(*out)[i]->set_impl(tensor_ptr); + } + } + return results; +} + phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto select_rows = std::make_shared(); diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 47b80bb3fc290..7303e6b46114d 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -74,6 +74,9 @@ std::vector SetKernelOutput(size_t out_size, Backend backend, std::vector* out); +// For backward api +std::vector SetKernelOutput(std::vector* out); + phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out); phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type); diff --git a/paddle/phi/api/lib/int_array.cc b/paddle/phi/api/lib/int_array.cc new file mode 100644 index 0000000000000..503fc8184abf6 --- /dev/null +++ b/paddle/phi/api/lib/int_array.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/common/int_array.h" + +#include "paddle/phi/api/lib/tensor_copy.h" +#include "paddle/phi/common/place.h" + +namespace paddle { +namespace experimental { + +template <> +IntArrayBase::IntArrayBase(const Tensor& tensor) { // NOLINT + is_from_tensor_ = true; + if (tensor.place().GetType() == phi::AllocationType::CPU) { + AssignDataFromTensor(tensor); + } else { + Tensor tensor_tmp; + copy(tensor, phi::CPUPlace(), true, &tensor_tmp); + AssignDataFromTensor(tensor_tmp); + } +} + +template <> +IntArrayBase::IntArrayBase(const std::vector& tensor_list) { + is_from_tensor_ = true; + + for (size_t i = 0; i < tensor_list.size(); ++i) { + DataType data_type = tensor_list[i].dtype(); + switch (data_type) { + case DataType::INT32: + if (tensor_list[i].place().GetType() == AllocationType::CPU) { + array_.push_back(*tensor_list[i].template data()); + } else { + Tensor tensor_tmp; + copy(tensor_list[i], phi::CPUPlace(), true, &tensor_tmp); + array_.push_back(*tensor_tmp.template data()); + } + break; + case DataType::INT64: + if (tensor_list[i].place().GetType() == AllocationType::CPU) { + array_.push_back(*tensor_list[i].template data()); + } else { + Tensor tensor_tmp; + copy(tensor_list[i], phi::CPUPlace(), true, &tensor_tmp); + array_.push_back(*tensor_tmp.template data()); + } + break; + default: + PD_THROW( + "Data type error. 
Currently, The data type of IntArrayBase " + "only supports Tensor with int32 and int64, " + "but now received `", + data_type, + "`."); + } + } +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index a7b89d7a4dca9..a340c0fed10d8 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -394,8 +394,8 @@ uint32_t Tensor::current_inplace_version() { static_cast(impl_.get())->InplaceVersionCounter(); return inplace_version_counter.CurrentVersion(); } else { - PADDLE_THROW(phi::errors::Unimplemented( - "current_inplace_version is only supported on DenseTensor now.")); + LOG_FIRST_N(WARNING, 1) + << "current_inplace_version is only supported on DenseTensor now."; } return 0; } diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index de97e7516f619..5689b2d43a4f2 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor scalar) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor int_array scalar) diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 5a6f1b1a7ee0c..c9fb2d3734edc 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -67,16 +67,9 @@ phi::IntArray MakePhiIntArray(const paddle::framework::Tensor& src) { } phi::IntArray MakePhiIntArrayFromVar(const framework::Variable& variable) { - auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); - if (!platform::is_same_place(tensor.place(), expected_place)) { - framework::LoDTensor tmp_tensor; - framework::TensorCopySync(tensor, expected_place, &tmp_tensor); - return MakePhiIntArray(tmp_tensor); - } else { - return MakePhiIntArray(tensor); - } + return MakePhiIntArray(tensor); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupport casting input `%s` type to IntArray when call pt " diff --git a/paddle/phi/backends/gpu/CMakeLists.txt b/paddle/phi/backends/gpu/CMakeLists.txt index d14e94024f90f..ebe8f1ca4c101 100644 --- a/paddle/phi/backends/gpu/CMakeLists.txt +++ b/paddle/phi/backends/gpu/CMakeLists.txt @@ -6,4 +6,5 @@ elseif(WITH_ROCM) hip_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda) endif() -cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3) +cc_library(gpu_resources SRCS gpu_resources.cc DEPS phi_device_context phi_gpu_info) +cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3 gpu_resources) diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index ff238b7997865..e5d34376834dd 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -25,6 +25,7 @@ limitations under the License. 
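The new int_array.cc above builds an IntArray from a list of scalar tensors by switching on each tensor's dtype, copying device-resident scalars to CPU first, and rejecting anything that is not int32 or int64. A reduced sketch of the dtype dispatch with a toy scalar holder (the DataType enum and Scalar struct are illustrative; the real code reads tensor.data<T>() after an optional device-to-host copy):

#include <cstdint>
#include <stdexcept>
#include <vector>

enum class DataType { INT32, INT64, FLOAT32 };

struct Scalar {
  DataType dtype;
  std::int64_t storage;  // value already copied to host
};

std::vector<std::int64_t> ToIntArray(const std::vector<Scalar>& scalars) {
  std::vector<std::int64_t> out;
  out.reserve(scalars.size());
  for (const auto& s : scalars) {
    switch (s.dtype) {
      case DataType::INT32:
        out.push_back(static_cast<std::int32_t>(s.storage));  // narrow, then store
        break;
      case DataType::INT64:
        out.push_back(s.storage);
        break;
      default:
        throw std::invalid_argument(
            "IntArray only accepts int32/int64 elements");
    }
  }
  return out;
}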
*/ #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_resources.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" @@ -202,27 +203,65 @@ struct GPUContext::Impl { void Init() { owned_ = true; backends::gpu::GPUDeviceGuard guard(place_.device); - InitGpuProperties(); - InitStream(); + phi::InitGpuProperties(place_, + &compute_capability_, + &runtime_version_, + &driver_version_, + &multi_process_, + &max_threads_per_mp_, + &max_threads_per_block_, + &max_grid_dim_size_); + phi::InitStream(&stream_); InitEigenDevice(); - InitBlasHandle(); - InitBlasLtHandle(); - InitDNNHandle(); - InitSolverHandle(); - InitSparseHandle(); + phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#endif +#if CUDA_VERSION >= 11000 + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif +#endif + phi::InitBlasLtHandle(&blaslt_handle_); + phi::InitDnnHandle(&dnn_handle_, stream_, place_); + phi::InitSolverHandle(&solver_handle_, stream_); + phi::InitSparseHandle(&sparse_handle_, stream_); InitDnnWorkspace(); } void PartialInitWithoutAllocator() { owned_ = true; backends::gpu::GPUDeviceGuard guard(place_.device); - InitGpuProperties(); - InitStream(); - InitBlasHandle(); - InitBlasLtHandle(); - InitDNNHandle(); - InitSolverHandle(); - InitSparseHandle(); + phi::InitGpuProperties(place_, + &compute_capability_, + &runtime_version_, + &driver_version_, + &multi_process_, + &max_threads_per_mp_, + &max_threads_per_block_, + &max_grid_dim_size_); + phi::InitStream(&stream_); + phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#endif +#if CUDA_VERSION >= 11000 + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif +#endif + phi::InitBlasLtHandle(&blaslt_handle_); + phi::InitDnnHandle(&dnn_handle_, stream_, place_); + phi::InitSolverHandle(&solver_handle_, stream_); + phi::InitSparseHandle(&sparse_handle_, stream_); } void PartialInitWithAllocator() { @@ -238,19 +277,23 @@ struct GPUContext::Impl { ~Impl() { backends::gpu::GPUDeviceGuard guard(place_.device); - DestoryInternalWorkspace(); - DestoryInternalEigenDevice(); - DestroyInternalSparseHandle(); - DestroyInternalSolverHandle(); - DestroyInternalDnnHandle(); + if (owned_) { + DestoryInternalWorkspace(); + DestoryInternalEigenDevice(); + phi::DestroySparseHandle(sparse_handle_); + phi::DestroySolverHandle(solver_handle_); + phi::DestroyDnnHandle(dnn_handle_); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (nccl_comm_) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); - } + if (nccl_comm_) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + } #endif - DestroyInternalBlasHandle(); - DestroyInternalBlasLtHandle(); - DestoryInternalStream(); + 
phi::DestroyBlasHandle(blas_handle_); + phi::DestroyBlasHandle(blas_tensor_core_handle_); + phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); + phi::DestroyBlasLtHandle(blaslt_handle_); + phi::DestoryStream(stream_); + } } const Place& GetPlace() const { return place_; } @@ -259,73 +302,6 @@ struct GPUContext::Impl { return blas_tensor_core_handle_ != nullptr; } - void InitGpuProperties() { - backends::gpu::GPUDeviceGuard guard(place_.GetDeviceId()); - compute_capability_ = - backends::gpu::GetGPUComputeCapability(place_.GetDeviceId()); - multi_process_ = backends::gpu::GetGPUMultiProcessors(place_.GetDeviceId()); - max_threads_per_mp_ = - backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId()); - max_grid_dim_size_ = - backends::gpu::GetGpuMaxGridDimSize(place_.GetDeviceId()); - max_threads_per_block_ = - backends::gpu::GetGPUMaxThreadsPerBlock(place_.GetDeviceId()); - driver_version_ = backends::gpu::GetGPUDriverVersion(place_.GetDeviceId()); - runtime_version_ = - backends::gpu::GetGPURuntimeVersion(place_.GetDeviceId()); - - // TODO(wilber): glog may be replaced in the future? - LOG_FIRST_N(WARNING, 1) - << "Please NOTE: device: " << static_cast(place_.device) - << ", GPU Compute Capability: " << compute_capability_ / 10 << "." - << compute_capability_ % 10 - << ", Driver API Version: " << driver_version_ / 1000 << "." - << (driver_version_ % 100) / 10 - << ", Runtime API Version: " << runtime_version_ / 1000 << "." - << (runtime_version_ % 100) / 10; -#ifdef PADDLE_WITH_HIP - size_t miopen_major, miopen_minor, miopen_patch; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); - auto cudnn_dso_ver = - (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; - auto compile_miopen_version = MIOPEN_VERSION / 10; - if (cudnn_dso_ver < static_cast(compile_miopen_version)) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << static_cast(place_.device) - << ". The installed Paddle is compiled with MIOPEN " - << compile_miopen_version / 100 << "." << compile_miopen_version % 100 - << ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100 - << "." << cudnn_dso_ver % 100 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible MIOPEN " - "version."; - } -#else - size_t cudnn_dso_ver = dynload::cudnnGetVersion(); - LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) - << ", cuDNN Version: " << cudnn_dso_ver / 1000 - << "." << (cudnn_dso_ver % 1000) / 100 << "."; - - // Check CUDA/CUDNN version compatiblity - auto local_cuda_version = - (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; - auto compile_cuda_version = - (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; - if (local_cuda_version < compile_cuda_version) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << static_cast(place_.device) - << ". The installed Paddle is compiled with CUDA " - << compile_cuda_version / 10 << "." << compile_cuda_version % 10 - << ", but CUDA runtime version in your machine is " - << local_cuda_version / 10 << "." << local_cuda_version % 10 - << ", which may cause serious incompatible bug. 
" - << "Please recompile or reinstall Paddle with compatible CUDA " - "version."; - } -#endif - } - void InitDnnWorkspace() { PD_CHECK(allocator_ != nullptr, "the device allocator for gpu context is nullptr."); @@ -350,27 +326,6 @@ struct GPUContext::Impl { return DnnWorkspaceHandle(allocator_, stream_); } - void InitStream() { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipStreamCreateWithPriority(&stream_, hipStreamDefault, 0)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, 0)); -#endif - } - - void DestoryInternalStream() { - if (owned_ && stream_ != nullptr) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); -#endif - } - stream_ = nullptr; - } - void SetStream(gpuStream_t stream) { stream_ = stream; } gpuStream_t GetStream() const { @@ -400,55 +355,6 @@ struct GPUContext::Impl { return eigen_device_; } - void InitBlasHandle() { -#ifdef PADDLE_WITH_HIP - phi::dynload::rocblas_create_handle(&blas_handle_); - phi::dynload::rocblas_set_stream(blas_handle_, stream_); -#else // PADDLE_WITH_CUDA - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(&blas_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(blas_handle_, stream_)); -#if CUDA_VERSION >= 9000 - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasCreate(&blas_tensor_core_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream_)); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); -#if CUDA_VERSION >= 11000 - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasCreate(&blas_tf32_tensor_core_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream_)); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); -#endif // CUDA_VERSION >= 11000 -#endif // CUDA_VERSION >= 9000 -#endif // PADDLE_WITH_HIP - } - - void DestroyInternalBlasHandle() { -#ifdef PADDLE_WITH_HIP - if (owned_ && blas_handle_ != nullptr) { - phi::dynload::rocblas_destroy_handle(blas_handle_); - blas_handle_ = nullptr; - } -#else - if (owned_ && blas_handle_ != nullptr) { - phi::dynload::cublasDestroy(blas_handle_); - blas_handle_ = nullptr; - } - if (owned_ && blas_tensor_core_handle_ != nullptr) { - phi::dynload::cublasDestroy(blas_tensor_core_handle_); - blas_tensor_core_handle_ = nullptr; - } - if (owned_ && blas_tf32_tensor_core_handle_ != nullptr) { - phi::dynload::cublasDestroy(blas_tf32_tensor_core_handle_); - blas_tf32_tensor_core_handle_ = nullptr; - } -#endif // PADDLE_WITH_HIP - } - blasHandle_t GetBlasHandle() const { PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr."); return blas_handle_; @@ -456,16 +362,12 @@ struct GPUContext::Impl { void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; } - void InitBlasLtHandle() { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - phi::dynload::cublasLtCreate(&blaslt_handle_); -#endif + void SetBlasTensorCoreHandle(blasHandle_t handle) { + blas_tensor_core_handle_ = handle; } - void DestroyInternalBlasLtHandle() { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - phi::dynload::cublasLtDestroy(blaslt_handle_); -#endif + void SetBlasTF32Handle(blasHandle_t handle) { + blas_tf32_tensor_core_handle_ = handle; } void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } 
@@ -475,53 +377,6 @@ struct GPUContext::Impl { return blaslt_handle_; } - void InitDNNHandle() { - if (phi::dynload::HasCUDNN()) { -#ifdef PADDLE_WITH_HIP - size_t miopen_major, miopen_minor, miopen_patch; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( - &miopen_major, &miopen_minor, &miopen_patch)); - auto local_miopen_version = - (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; - auto compile_miopen_version = MIOPEN_VERSION / 10; - if (local_miopen_version < static_cast(compile_miopen_version)) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << place_.device - << ". The installed Paddle is compiled with MIOPEN " - << compile_miopen_version / 100 << "." - << compile_miopen_version % 100 - << ", but MIOPEN version in your machine is " - << local_miopen_version / 100 << "." << local_miopen_version % 100 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible MIOPEN " - "version."; - } - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&dnn_handle_)); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::miopenSetStream(dnn_handle_, stream_)); -#else - auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; - auto compile_cudnn_version = CUDNN_VERSION / 100; - if (local_cudnn_version < static_cast(compile_cudnn_version)) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << place_.device - << ". The installed Paddle is compiled with CUDNN " - << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 - << ", but CUDNN version in your machine is " - << local_cudnn_version / 10 << "." << local_cudnn_version % 10 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible CUDNN " - "version."; - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnCreate(&dnn_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cudnnSetStream(dnn_handle_, stream_)); -#endif - } else { - dnn_handle_ = nullptr; - } - } - dnnHandle_t GetDnnHandle() { PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr."); return dnn_handle_; @@ -543,24 +398,6 @@ struct GPUContext::Impl { void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; } - void InitSolverHandle() { -#ifndef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(&solver_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cusolverDnSetStream(solver_handle_, stream_)); -#endif - } - - void DestroyInternalSolverHandle() { -#ifndef PADDLE_WITH_HIP - if (owned_ && solver_handle_ != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnDestroy(solver_handle_)); - solver_handle_ = nullptr; - } -#endif - } - solverHandle_t GetSolverHandle() const { PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); return solver_handle_; @@ -568,29 +405,6 @@ struct GPUContext::Impl { void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } - void InitSparseHandle() { -// ROCM is not yet supported -#if defined(PADDLE_WITH_CUDA) -// The generic APIs is supported from CUDA10.1 -#if CUDA_VERSION >= 10010 - PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&sparse_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - dynload::cusparseSetStream(sparse_handle_, stream_)); -#endif -#endif - } - - void DestroyInternalSparseHandle() { -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10010 - if (owned_ && sparse_handle_ != nullptr) { - PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(sparse_handle_)); - sparse_handle_ = nullptr; - } -#endif -#endif - } - sparseHandle_t 
GetSparseHandle() const { PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr."); return sparse_handle_; @@ -878,7 +692,10 @@ void GPUContext::Init() { impl_->Init(); } -void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); } +void GPUContext::SetStream(gpuStream_t stream) { + impl_->allocator_ = const_cast(&this->GetAllocator()); + impl_->SetStream(stream); +} void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) { impl_->SetEigenDevice(device); @@ -888,6 +705,14 @@ void GPUContext::SetBlasHandle(blasHandle_t blas) { impl_->SetBlasHandle(blas); } +void GPUContext::SetBlasTensorCoreHandle(blasHandle_t handle) { + impl_->SetBlasTensorCoreHandle(handle); +} + +void GPUContext::SetBlasTF32Handle(blasHandle_t handle) { + impl_->SetBlasTF32Handle(handle); +} + void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { impl_->SetBlasLtHandle(blaslt); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 8d44acaa4a083..db9f287041dfb 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -199,6 +199,10 @@ class PADDLE_API GPUContext : public DeviceContext { void SetBlasHandle(blasHandle_t); + void SetBlasTensorCoreHandle(blasHandle_t); + + void SetBlasTF32Handle(blasHandle_t); + void SetBlasLtHandle(blasLtHandle_t); void SetDnnHandle(dnnHandle_t); diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc new file mode 100644 index 0000000000000..268024eb25949 --- /dev/null +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_resources.h" + +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/phi/backends/dynload/cublas.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/backends/dynload/cusparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include "paddle/phi/backends/dynload/nccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#endif // PADDLE_WITH_CUDA + +#include "unsupported/Eigen/CXX11/Tensor" + +// TODO(phi): remove fluid header. 
+#include "paddle/fluid/platform/enforce.h" + +namespace phi { + +void InitGpuProperties(Place place, + int* compute_capability, + int* runtime_version, + int* driver_version, + int* multi_process, + int* max_threads_per_mp, + int* max_threads_per_block, + std::array* max_grid_dim_size) { + backends::gpu::GPUDeviceGuard guard(place.GetDeviceId()); + *compute_capability = + backends::gpu::GetGPUComputeCapability(place.GetDeviceId()); + *multi_process = backends::gpu::GetGPUMultiProcessors(place.GetDeviceId()); + *max_threads_per_mp = + backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place.GetDeviceId()); + *max_grid_dim_size = backends::gpu::GetGpuMaxGridDimSize(place.GetDeviceId()); + *max_threads_per_block = + backends::gpu::GetGPUMaxThreadsPerBlock(place.GetDeviceId()); + *driver_version = backends::gpu::GetGPUDriverVersion(place.GetDeviceId()); + *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId()); + + // TODO(wilber): glog may be replaced in the future? + LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " + << static_cast(place.device) + << ", GPU Compute Capability: " + << *compute_capability / 10 << "." + << *compute_capability % 10 + << ", Driver API Version: " << *driver_version / 1000 + << "." << (*driver_version % 100) / 10 + << ", Runtime API Version: " + << *runtime_version / 1000 << "." + << (*runtime_version % 100) / 10; +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); + auto cudnn_dso_ver = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (cudnn_dso_ver < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100 + << "." << cudnn_dso_ver % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } +#else + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) + << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." + << (cudnn_dso_ver % 1000) / 100 << "."; + + // Check CUDA/CUDNN version compatiblity + auto local_cuda_version = + (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; + auto compile_cuda_version = + (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } +#endif +} + +void InitStream(gpuStream_t* stream) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); +#endif +} + +void DestoryStream(gpuStream_t stream) { + if (stream != nullptr) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); +#endif + } + stream = nullptr; +} + +void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { +#ifdef PADDLE_WITH_HIP + phi::dynload::rocblas_create_handle(blas_handle); + phi::dynload::rocblas_set_stream(*blas_handle, stream); +#else // PADDLE_WITH_CUDA + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::cublasSetStream(*blas_handle, stream)); +#endif // PADDLE_WITH_HIP +} + +void DestroyBlasHandle(blasHandle_t handle) { +#ifdef PADDLE_WITH_HIP + if (handle != nullptr) { + phi::dynload::rocblas_destroy_handle(handle); + handle = nullptr; + } +#else + if (handle != nullptr) { + phi::dynload::cublasDestroy(handle); + handle = nullptr; + } +#endif // PADDLE_WITH_HIP +} + +void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + phi::dynload::cublasLtCreate(blaslt_handle); +#endif +} + +void DestroyBlasLtHandle(blasLtHandle_t handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + if (handle != nullptr) { + phi::dynload::cublasLtDestroy(handle); + handle = nullptr; + } +#endif +} + +void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { + if (phi::dynload::HasCUDNN()) { +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); + auto local_miopen_version = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (local_miopen_version < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place.device + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " + << local_miopen_version / 100 << "." << local_miopen_version % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); +#else + auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; + auto compile_cudnn_version = CUDNN_VERSION / 100; + if (local_cudnn_version < static_cast(compile_cudnn_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place.device + << ". The installed Paddle is compiled with CUDNN " + << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 + << ", but CUDNN version in your machine is " + << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible CUDNN " + "version."; + } + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnSetStream(*handle, stream)); +#endif + } else { + *handle = nullptr; + } +} + +void DestroyDnnHandle(dnnHandle_t handle) { +#ifdef PADDLE_WITH_HIP + if (handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); + handle = nullptr; + } +#else + if (handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); + handle = nullptr; + } +#endif // PADDLE_WITH_HIP +} + +void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { +#ifndef PADDLE_WITH_HIP + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); +#endif +} + +void DestroySolverHandle(solverHandle_t solver_handle) { +#ifndef PADDLE_WITH_HIP + if (solver_handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); + solver_handle = nullptr; + } +#endif +} + +void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { +// ROCM is not yet supported +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 10010 + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(*handle, stream)); +#endif +#endif +} + +void DestroySparseHandle(sparseHandle_t handle) { +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10010 + if (handle != nullptr) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle)); + handle = nullptr; + } +#endif +#endif +} + +} // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h new file mode 100644 index 0000000000000..07ccb6215409a --- /dev/null +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/common/place.h" + +namespace phi { + +void InitGpuProperties(Place place, + int* compute_capability, + int* runtime_version, + int* driver_version, + int* multi_process, + int* max_threads_per_mp, + int* max_threads_per_block, + std::array* max_grid_dim_size); + +void InitStream(gpuStream_t* stream); +void DestoryStream(gpuStream_t stream); + +void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream); +void DestroyBlasHandle(blasHandle_t handle); + +void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); +void DestroyBlasLtHandle(blasLtHandle_t handle); + +void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place); +void DestroyDnnHandle(dnnHandle_t handle); + +void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); +void DestroySolverHandle(solverHandle_t solver_handle); + +void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream); +void DestroySparseHandle(sparseHandle_t handle); + +// void InitDnnWorkspace(); + +} // namespace phi diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index aa839eab587cb..b1ca4d1f8a8c6 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,3 @@ cc_library(phi_place SRCS place.cc) cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor) +cc_library(int_array SRCS int_array.cc DEPS phi_enforce tensor) diff --git a/paddle/phi/common/int_array.cc b/paddle/phi/common/int_array.cc new file mode 100644 index 0000000000000..daed2b6625a9e --- /dev/null +++ b/paddle/phi/common/int_array.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/common/int_array.h" + +#include "paddle/phi/common/place.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace experimental { + +template <> +IntArrayBase::IntArrayBase( + const phi::DenseTensor& tensor) { // NOLINT + is_from_tensor_ = true; + if (tensor.place().GetType() == AllocationType::CPU) { + AssignDataFromTensor(tensor); + } else { + phi::DenseTensor tensor_tmp; + paddle::framework::TensorCopySync(tensor, CPUPlace(), &tensor_tmp); + AssignDataFromTensor(tensor_tmp); + } +} + +template <> +IntArrayBase::IntArrayBase( + const std::vector& tensor_list) { + is_from_tensor_ = true; + + for (size_t i = 0; i < tensor_list.size(); ++i) { + DataType data_type = tensor_list[i].dtype(); + switch (data_type) { + case DataType::INT32: + if (tensor_list[i].place().GetType() == AllocationType::CPU) { + array_.push_back(*tensor_list[i].template data()); + } else { + phi::DenseTensor tensor_tmp; + paddle::framework::TensorCopySync( + tensor_list[i], CPUPlace(), &tensor_tmp); + array_.push_back(*tensor_tmp.template data()); + } + break; + case DataType::INT64: + if (tensor_list[i].place().GetType() == AllocationType::CPU) { + array_.push_back(*tensor_list[i].template data()); + } else { + phi::DenseTensor tensor_tmp; + paddle::framework::TensorCopySync( + tensor_list[i], CPUPlace(), &tensor_tmp); + array_.push_back(*tensor_tmp.template data()); + } + break; + default: + PD_THROW( + "Data type error. Currently, The data type of IntArrayBase " + "only supports Tensor with int32 and int64, " + "but now received `", + data_type, + "`."); + } + } +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/common/int_array.h b/paddle/phi/common/int_array.h index f9d07249e0fc9..91b9ace136bc2 100644 --- a/paddle/phi/common/int_array.h +++ b/paddle/phi/common/int_array.h @@ -48,50 +48,10 @@ class IntArrayBase { void SetFromTensor(bool val) { is_from_tensor_ = val; } // The Tensor must have one dim - IntArrayBase(const T& tensor) { // NOLINT - is_from_tensor_ = true; - size_t n = tensor.numel(); - array_.reserve(n); - switch (tensor.dtype()) { - case DataType::INT32: - AssignData(tensor.template data(), n); - break; - case DataType::INT64: - AssignData(tensor.template data(), n); - break; - default: - PD_THROW( - "Data type error. Currently, The data type of IntArrayBase " - "only supports Tensor with int32 and int64, " - "but now received `", - tensor.dtype(), - "`."); - } - } + IntArrayBase(const T& tensor); // NOLINT // The Tensor in vec must have only one element - IntArrayBase(const std::vector& tensor_list) { // NOLINT - is_from_tensor_ = true; - - for (size_t i = 0; i < tensor_list.size(); ++i) { - DataType data_type = tensor_list[i].dtype(); - switch (data_type) { - case DataType::INT32: - array_.push_back(*tensor_list[i].template data()); - break; - case DataType::INT64: - array_.push_back(*tensor_list[i].template data()); - break; - default: - PD_THROW( - "Data type error. 
Currently, The data type of IntArrayBase " - "only supports Tensor with int32 and int64, " - "but now received `", - data_type, - "`."); - } - } - } + IntArrayBase(const std::vector& tensor_list); // NOLINT template IntArrayBase(const IntArrayBase& other) : array_(other.GetData()) {} @@ -114,6 +74,26 @@ class IntArrayBase { } } + void AssignDataFromTensor(const T& tensor) { + size_t n = tensor.numel(); + array_.reserve(n); + switch (tensor.dtype()) { + case DataType::INT32: + AssignData(tensor.template data(), n); + break; + case DataType::INT64: + AssignData(tensor.template data(), n); + break; + default: + PD_THROW( + "Data type error. Currently, The data type of IntArrayBase " + "only supports Tensor with int32 and int64, " + "but now received `", + tensor.dtype(), + "`."); + } + } + private: // TODO(zhangyunfei) Replace std::vector with a more efficient container // structure. diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 2139605fb2048..837a43905e723 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -1534,7 +1534,7 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { phi::errors::InvalidArgument( "X's second dimension is expected to be equal to " "Vec's first dimension" - "but recieved X'shape = [%s], Vec's shape = [%s]", + "but received X'shape = [%s], Vec's shape = [%s]", dim_x, dim_vec)); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 519d21b323fc2..48c40673ab819 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -458,7 +458,7 @@ void BatchNormInferMeta(const MetaTensor& x, true, phi::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " - "positive number, but recieved %d. Input's shape is [%s].", + "positive number, but received %d. Input's shape is [%s].", x_dims[i], x_dims)); } @@ -755,7 +755,7 @@ inline int ConvOutputSize( 0, phi::errors::InvalidArgument( "The output's size is expected to be greater than 0. " - "But recieved: output's size is %d. The output's size is computed by " + "But received: output's size is %d. The output's size is computed by " "((input_size + 2 * padding - (dilation * (filter_size - 1) + 1)) / " "stride + 1), where input_size is %d, padding is %d, " "filter_size is %d, dilation is %d, stride is %d.", @@ -1998,7 +1998,9 @@ void StackInferMeta(const std::vector& x, void UnchangedMultiInferMeta(const std::vector& x, std::vector out) { for (size_t i = 0; i < x.size(); ++i) { - out[i]->share_meta(*x[i]); + if (out[i]) { + out[i]->share_meta(*x[i]); + } } } diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 6d37a31f54562..6c2956417a3a3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -2746,7 +2746,7 @@ void UnfoldInferMeta(const MetaTensor& x, phi::errors::InvalidArgument( "The dims of X should be larger than that of kernel_sizes " "by a number of 2, due to the batch size and input channel dim. " - "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", + "But received dims(X:%u) - dims(kernel_sizes:%u) != 2", in_dims.size(), kernel_sizes.size())); PADDLE_ENFORCE_EQ( @@ -2754,7 +2754,7 @@ void UnfoldInferMeta(const MetaTensor& x, kernel_sizes.size(), phi::errors::InvalidArgument( "The dims of strides should be the same with that of kernel_sizes. 
" - "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", + "But received dims(strides: %u) != dims(kernel_sizes: %u).", strides.size(), kernel_sizes.size())); PADDLE_ENFORCE_EQ( @@ -2762,7 +2762,7 @@ void UnfoldInferMeta(const MetaTensor& x, 2 * strides.size(), phi::errors::InvalidArgument( "The dims of paddings should be 2 times of that of strides. " - "But recieved dims(paddings: %u) != 2*dims(strides: %u).", + "But received dims(paddings: %u) != 2*dims(strides: %u).", paddings.size(), strides.size())); PADDLE_ENFORCE_EQ( @@ -2770,7 +2770,7 @@ void UnfoldInferMeta(const MetaTensor& x, dilations.size(), phi::errors::InvalidArgument( "The dims of strides should be the same with that of dilations. " - "But recieved dims(strides: %u) != dims(dilations: %u).", + "But received dims(strides: %u) != dims(dilations: %u).", strides.size(), dilations.size())); @@ -2779,14 +2779,14 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); PADDLE_ENFORCE_GT(kernel_sizes[1], 0, phi::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); // check strides @@ -2794,14 +2794,14 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); PADDLE_ENFORCE_GT(strides[1], 0, phi::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations @@ -2810,7 +2810,7 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); PADDLE_ENFORCE_GT( @@ -2818,7 +2818,7 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index fd42756ba3867..084843c31cf52 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -187,6 +187,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt); DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Round); DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Floor); diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h index d26d89086b27e..159a5cfbeb6b4 100644 --- a/paddle/phi/kernels/cpu/conv_util.h +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -38,7 +38,7 @@ inline void UpdatePaddingAndDilation(std::vector* paddings, phi::errors::InvalidArgument( "Attribute padding's size should be the same or 
twice as the " "input's dimension. " - "But recieved: padding's size is %d, padding is [%s]; input's " + "But received: padding's size is %d, padding is [%s]; input's " "dimension is %d, input's shape is [%s].", paddings->size(), make_ddim(*paddings), diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 3f5e0b8a4d8ee..ee384cc75193c 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -88,6 +88,16 @@ PD_REGISTER_KERNEL(minimum_grad, int, int64_t, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(elementwise_heaviside_grad, + CPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideGradKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(elementwise_pow_grad, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 7478f69d915f1..286b0d0ffaad9 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -95,6 +95,18 @@ void ElementwisePowRawKernel(const Context& dev_ctx, dev_ctx, x, y, axis, funcs::ElementwisePowFunctor(), out); } +template +void ElementwiseHeavisideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::ElementwiseHeavisideFunctor(), out); +} + } // namespace phi using complex64 = ::phi::dtype::complex; @@ -149,3 +161,11 @@ PD_REGISTER_KERNEL(elementwise_pow_raw, double, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside_raw, + CPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideRawKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index cae97eb076453..ae2c7a72635f7 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -808,7 +808,7 @@ struct BidirLayer : public Layer { mode, is_test); - // concat the the output result + // concat the output result funcs::ConcatFunctor concat_functor; concat_functor(dev_ctx, output_vec, static_cast(2), output); } diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index 6f2f2915ecf9e..b1e6ecaee6746 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -55,6 +55,15 @@ void MinimumGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* dy); +template +void ElementwiseHeavisideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + template void ElementwisePowGradKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 9d608cd86a6f7..5e29eb5ace675 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -64,6 +64,15 @@ void ElementwisePowKernel(const Context& dev_ctx, ElementwisePowRawKernel(dev_ctx, x, y, axis, out); } +template +void ElementwiseHeavisideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + ElementwiseHeavisideRawKernel(dev_ctx, x, y, axis, out); +} + } // namespace phi using complex64 = ::phi::dtype::complex; @@ -91,6 +100,14 @@ 
PD_REGISTER_KERNEL( modulo, CPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL( floor_divide, CPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside, + CPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_KERNEL(elementwise_pow, CPU, ALL_LAYOUT, @@ -126,6 +143,14 @@ PD_REGISTER_KERNEL( modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL( floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside, + GPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_KERNEL(elementwise_pow, KPS, ALL_LAYOUT, diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index 37fe895d4051f..a39da52e7e3b5 100644 --- a/paddle/phi/kernels/elementwise_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -98,6 +98,19 @@ void ElementwisePowKernel(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out); +template +void ElementwiseHeavisideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void ElementwiseHeavisideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + template DenseTensor Maximum(const Context& dev_ctx, const DenseTensor& x, @@ -142,6 +155,17 @@ DenseTensor FloorDivide(const Context& dev_ctx, return dense_out; } +template +DenseTensor ElementwiseHeaviside(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + ElementwiseHeavisideKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + template DenseTensor ElementwisePow(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index aafa40a3d01bf..b473d68b68ba9 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -53,7 +53,7 @@ struct DimensionsTransform { PADDLE_THROW(phi::errors::InvalidArgument( "The %d-th dimension of input tensor is expected to be equal " "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", + "received %d.", in_idx + 1, axis + 1, out_dims[axis], @@ -70,7 +70,7 @@ struct DimensionsTransform { PADDLE_THROW(phi::errors::InvalidArgument( "The %d-th dimension of input tensor is expected to be equal " "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", + "received %d.", in_idx + 1, in_idx + 1, out_dims[in_idx], @@ -223,22 +223,54 @@ struct DimensionsTransform { } }; -template +template +int GetVecsize(const std::vector &ins, + std::vector *outs) { + int in_vec_size = 4; + int out_vec_size = 4; + if (NumOuts > 1) { + for (int i = 0; i < NumOuts; ++i) { + PADDLE_ENFORCE_EQ( + (*outs)[i]->dims(), + (*outs)[0]->dims(), + phi::errors::InvalidArgument( + "The shape of each output tensor shall be identical yet, but " + "%d-th output tensor`s shape is not.", + i)); + out_vec_size = std::min( + phi::GetVectorizedSize((*outs)[i]->data()), out_vec_size); + } + } else { + out_vec_size = phi::GetVectorizedSize((*outs)[0]->data()); + } + + for (auto *in : ins) { + auto temp_size = phi::GetVectorizedSize(in->data()); + in_vec_size = in->dims() == (*outs)[0]->dims() + ? 
std::min(temp_size, in_vec_size) + : in_vec_size; + } + return std::min(out_vec_size, in_vec_size); +} + +template __device__ __forceinline__ void LoadData( T *dst, const _ptr_ T *src, uint32_t block_offset, - const kps::details::BroadcastConfig &config, + const kps::details::BroadcastConfig &config, int numel, int num, - int need_broadcast) { + int need_broadcast, + int read_lens) { // numel : whole num of output // num: how many data will be deal with in this time if (need_broadcast) { - kps::ReadDataBc( - dst, src, block_offset, config, numel); + kps::ReadDataBc( + dst, src, block_offset, config, numel, read_lens); } else { - kps::ReadData(dst, src + block_offset, num); + kps::ReadData( + dst, src + block_offset, num, read_lens); } } @@ -248,30 +280,31 @@ template __device__ void VectorizedBroadcastKernelImpl( const phi::Array &ins, phi::Array<_ptr_ OutT *, NumOuts> outs, const phi::Array &use_broadcast, uint32_t numel, - const phi::Array, Arity> &configs, + const phi::Array &configs, int num, int block_offset, + int read_lens, Functor func) { - InT args[Arity][VecSize]; - ConditionalT result[VecSize]; + __simd__ InT args[Arity][VecSize]; + __simd__ ConditionalT result[VecSize]; #pragma unroll for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); - LoadData(args[i], - ins[i], - block_offset, - configs[i], - numel, - num, - use_broadcast[i]); + kps::Init(args[i], static_cast(1.0f), read_lens); + LoadData(args[i], + ins[i], + block_offset, + configs[i], + numel, + num, + use_broadcast[i], + read_lens); } constexpr bool kCallElementwiseAny = paddle::platform::FunctionTraits::has_pointer_args; @@ -281,10 +314,10 @@ __device__ void VectorizedBroadcastKernelImpl( Functor, Arity, kCallElementwiseAny>()( - func, args, result); - - phi::funcs::ElementwiseWriteDataCaller()( - outs, result, block_offset, num); + func, args, result, read_lens); + phi::funcs:: + ElementwiseWriteDataCallerBc()( + outs, result, block_offset, num, read_lens); } template + int VecSize> __global__ void VectorizedBroadcastKernel( phi::Array ins, phi::Array<_ptr_ OutT *, NumOuts> outs, phi::Array use_broadcast, uint32_t numel, - phi::Array, Arity> configs, + phi::Array configs, int main_offset, int tail_tid, + int read_lens, Functor func) { - int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int block_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens; + int stride = BLOCK_NUM_X * GRID_NUM_X * read_lens; #ifdef PADDLE_WITH_XPU_KP for (; block_offset < main_offset; block_offset += stride) { @@ -314,14 +347,14 @@ __global__ void VectorizedBroadcastKernel( Arity, NumOuts, VecSize, - Rank, false>(ins, outs, use_broadcast, numel, configs, - BLOCK_NUM_X * VecSize, + BLOCK_NUM_X * read_lens, block_offset, + read_lens, func); } int num = numel - block_offset; @@ -332,9 +365,15 @@ __global__ void VectorizedBroadcastKernel( Arity, NumOuts, VecSize, - Rank, - true>( - ins, outs, use_broadcast, numel, configs, num, block_offset, func); + true>(ins, + outs, + use_broadcast, + numel, + configs, + num, + block_offset, + read_lens, + func); } #else if (block_offset < main_offset) { @@ -344,7 +383,6 @@ __global__ void VectorizedBroadcastKernel( Arity, NumOuts, VecSize, - Rank, false>(ins, outs, use_broadcast, @@ -352,6 +390,7 @@ __global__ void VectorizedBroadcastKernel( configs, BLOCK_NUM_X * VecSize, block_offset, + read_lens, func); } else { VectorizedBroadcastKernelImpl( - ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); + true>(ins, + 
outs, + use_broadcast, + numel, + configs, + tail_tid, + block_offset, + read_lens, + func); } #endif } @@ -372,15 +417,14 @@ template -void LaunchBroadcastKernel(const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - Functor func, - DimensionsTransform merge_dims) { + int VecSize> +void LaunchBroadcastKernel( + const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + Functor func, + const phi::Array &configs) { int numel = (*outs)[0]->numel(); - phi::Array, Arity> configs; phi::Array use_broadcast; phi::Array ins_data; phi::Array<_ptr_ OutT *, NumOuts> outs_data; @@ -392,96 +436,41 @@ void LaunchBroadcastKernel(const KPDevice &ctx, for (int i = 0; i < Arity; i++) { use_broadcast[i] = (ins[i]->numel() != numel); ins_data[i] = (const _ptr_ InT *)(ins[i]->data()); - if (use_broadcast[i]) { - // get the broadcast config, - // if data shape is[m, n], then you should set data_dim = {n, m} - // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} - configs[i] = kps::details::BroadcastConfig( - merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); - } } #ifdef PADDLE_WITH_XPU_KP const int threads = 64; const int blocks = 8; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); + int read_lens = configs[0].buf_len; auto stream = ctx.x_context()->xpu_stream; - VectorizedBroadcastKernel<<>>(ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); + int main_offset = (numel / (read_lens * threads)) * read_lens * threads; + int tail_tid = numel % (read_lens * threads); #else - const int threads = 256; - int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); + int read_lens = VecSize; auto stream = ctx.stream(); + auto threads = gpu_config.thread_per_block; + auto blocks = gpu_config.block_per_grid; + int main_offset = (numel / (read_lens * gpu_config.GetBlockSize())) * + read_lens * gpu_config.GetBlockSize(); + int tail_tid = numel % (read_lens * gpu_config.GetBlockSize()); +#endif VectorizedBroadcastKernel<<>>(ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); -#endif -} - -template -void BroadcastKernelForDifferentDimSize( - const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); - -#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ - case rank: { \ - LaunchBroadcastKernel( \ - ctx, ins, outs, func, merge_dims); \ - } break; - - switch (merge_dims.dim_size) { - CALL_BROADCAST_FOR_DIM_SIZE(1); - CALL_BROADCAST_FOR_DIM_SIZE(2); - CALL_BROADCAST_FOR_DIM_SIZE(3); - CALL_BROADCAST_FOR_DIM_SIZE(4); - CALL_BROADCAST_FOR_DIM_SIZE(5); - CALL_BROADCAST_FOR_DIM_SIZE(6); - CALL_BROADCAST_FOR_DIM_SIZE(7); - CALL_BROADCAST_FOR_DIM_SIZE(8); - default: { - PADDLE_THROW(phi::errors::InvalidArgument( - "The maximum dimension of input tensor is expected to be less than " - "%d, but recieved %d.", - merge_dims.dim_size, - phi::DDim::kMaxRank)); - } - } -#undef CALL_BROADCAST_FOR_DIM_SIZE + VecSize><<>>( + ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + read_lens, + func); } template ; const int kArity = Traits::has_pointer_args ? 
static_cast(ET) : Traits::arity; - PADDLE_ENFORCE_EQ(ins.size(), - kArity, - phi::errors::InvalidArgument( - "The number of inputs is expected to be equal to the " - "arity of functor. But recieved: the number of inputs " - "is %d, the arity of functor is %d.", - ins.size(), - kArity)); - PADDLE_ENFORCE_LE(kArity, - 3, - phi::errors::InvalidArgument( - "Currently only broadcast of ternary is supported " - "and verified, but received %d.", - kArity)); - PADDLE_ENFORCE_EQ(outs->size(), - NumOuts, - phi::errors::InvalidArgument( - "Number of outputs shall equal to number of functions, " - "but number of outputs is %d, of functions is %d.", - outs->size(), - NumOuts)); - int in_vec_size = 4; - int out_vec_size = 4; - if (NumOuts > 1) { - for (int i = 0; i < NumOuts; ++i) { - PADDLE_ENFORCE_EQ( - (*outs)[i]->dims(), - (*outs)[0]->dims(), - phi::errors::InvalidArgument( - "The shape of each output tensor shall be identical yet, but " - "%d-th output tensor`s shape is not.", - i)); - out_vec_size = std::min( - phi::GetVectorizedSize((*outs)[i]->data()), out_vec_size); - } - } else { - out_vec_size = phi::GetVectorizedSize((*outs)[0]->data()); - } + PADDLE_ENFORCE_EQ( + ins.size(), + kArity, + phi::errors::InvalidArgument("The number of inputs is expected to be " + "equal to the " + "arity of functor. But recieved: the " + "number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), + kArity)); + PADDLE_ENFORCE_LE( + kArity, + 3, + phi::errors::InvalidArgument("Currently only broadcast of ternary is " + "supported " + "and verified, but received %d.", + kArity)); + PADDLE_ENFORCE_EQ( + outs->size(), + NumOuts, + phi::errors::InvalidArgument("Number of outputs shall equal to number " + "of functions, " + "but number of outputs is %d, of " + "functions is %d.", + outs->size(), + NumOuts)); + + // mergedim and get vec_size + const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); + phi::Array configs; - for (auto *in : ins) { - auto temp_size = phi::GetVectorizedSize(in->data()); - in_vec_size = in->dims() == (*outs)[0]->dims() - ? std::min(temp_size, in_vec_size) - : in_vec_size; +// get vec_size +#ifdef PADDLE_WITH_XPU_KP + PADDLE_ENFORCE_EQ( + ins.size(), + 2, + phi::errors::InvalidArgument( + "XPU only support inputs is 2, but received %d", ins.size())); + configs[0] = kps::details::BroadcastConfig(merge_dims.out_dims, + merge_dims.in_dims[0], + merge_dims.in_dims[1], + merge_dims.dim_size); + configs[1] = kps::details::BroadcastConfig(merge_dims.out_dims, + merge_dims.in_dims[1], + merge_dims.in_dims[0], + merge_dims.dim_size); + auto type = kps::details::OptType::CanNotOptimize; + bool is_optimize = configs[0].cmp_type != type; + int vec_size = is_optimize ? VecSizeL : VecSizeM; +#else + for (int i = 0; i < kArity; i++) { + // get the broadcast config, + // if data shape is[m, n], then you should set data_dim = {n, m} + // eg: out's shape [3, 45, 1]. 
then out_dims = {1, 45, 3} + if (ins[i]->numel()) { + configs[i] = kps::details::BroadcastConfig( + merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); + } } - int vec_size = std::min(out_vec_size, in_vec_size); + int vec_size = GetVecsize(ins, outs); +#endif switch (vec_size) { - case 4: { - BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + case VecSizeL: { + LaunchBroadcastKernel( + ctx, ins, outs, func, configs); break; } - case 2: { - BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + case VecSizeM: { + LaunchBroadcastKernel( + ctx, ins, outs, func, configs); break; } - case 1: { - BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + case VecSizeS: { + LaunchBroadcastKernel( + ctx, ins, outs, func, configs); break; } default: { diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 332ec0b0312da..1093bdfa726c8 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -577,14 +577,16 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseAny( result, args, func); } @@ -594,7 +596,8 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseConstant(result, func); } }; @@ -603,7 +606,8 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseUnary( result, args[0], func); } @@ -613,9 +617,10 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseBinary( - result, args[0], args[1], func); + result, args[0], args[1], func, read_lens); } }; @@ -623,7 +628,8 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseTernary( result, args[0], args[1], args[2], func); } @@ -696,6 +702,42 @@ struct ElementwiseWriteDataCaller { } }; +template +struct ElementwiseWriteDataCallerBc { + __device__ __forceinline__ void operator()( + phi::Array<_ptr_ OutT *, NumOuts> outs, + ConditionalT src[VecSize], + int block_offset, + int num, + int read_lens) { + OutT dst[NumOuts][VecSize]; +#pragma unroll + for (int i = 0; i < read_lens; ++i) { +#pragma unroll + for (int j = 0; j < NumOuts; ++j) { + dst[j][i] = (src[i])[j]; + } + } +#pragma unroll + for (int i = 0; i < NumOuts; ++i) { + kps::WriteData( + outs[i] + block_offset, dst[i], num, read_lens); + } + } +}; + +template +struct ElementwiseWriteDataCallerBc { + __device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs, + OutT src[VecSize], + int block_offset, + int num, + int read_lens) { + kps::WriteData( + outs[0] + block_offset, src, num, read_lens); + } +}; + template +struct ElementwiseHeavisideFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return a == static_cast(0) ? 
b : static_cast(a > 0); + } +}; + template struct FloorDivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { diff --git a/paddle/phi/kernels/gpu/bce_loss_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_kernel.cu index adbcd3b2b6207..b190bce474280 100644 --- a/paddle/phi/kernels/gpu/bce_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/bce_loss_kernel.cu @@ -38,7 +38,7 @@ struct BCELossFunctor { HOSTDEVICE inline T operator()(const T x, const T label) const { PADDLE_ENFORCE( (x >= static_cast(0)) && (x <= one), - "Input is expected to be within the interval [0, 1], but recieved %f.", + "Input is expected to be within the interval [0, 1], but received %f.", x); T term1 = max(phi::kps::details::Log(x), neg_100); T term2 = max(phi::kps::details::Log(one - x), neg_100); diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index c814e7b3bb63d..3e7430fd84eaf 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -128,6 +128,16 @@ PD_REGISTER_KERNEL(minimum_grad, int64_t, phi::dtype::float16, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(elementwise_heaviside_grad, + GPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideGradKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(elementwise_pow_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu index 3a6ff365c11db..66a3f833d276a 100644 --- a/paddle/phi/kernels/gpu/linspace_kernel.cu +++ b/paddle/phi/kernels/gpu/linspace_kernel.cu @@ -18,7 +18,6 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/data_type_transform.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -42,6 +41,47 @@ __global__ void LinspaceSpecialKernel(T start, T* out) { out[0] = static_cast(start); } +template +T GetValue(const Context& ctx, const DenseTensor& x) { + T value = static_cast(0); + if (x.place() != CPUPlace()) { + DenseTensor cpu_x; + Copy(ctx, x, CPUPlace(), true, &cpu_x); + value = cpu_x.data()[0]; + } else { + value = x.data()[0]; + } + return value; +} + +template +T GetValueOfExpectedType(const Context& ctx, const DenseTensor& x) { + switch (x.dtype()) { + case DataType::FLOAT32: + return static_cast(GetValue(ctx, x)); + case DataType::FLOAT64: + return static_cast(GetValue(ctx, x)); + case DataType::INT32: + return static_cast(GetValue(ctx, x)); + case DataType::INT64: + return static_cast(GetValue(ctx, x)); + case DataType::FLOAT16: + return static_cast(GetValue(ctx, x)); + case DataType::BFLOAT16: + return static_cast(GetValue(ctx, x)); + case DataType::BOOL: + return static_cast(GetValue(ctx, x)); + case DataType::INT16: + return static_cast(GetValue(ctx, x)); + case DataType::UINT8: + return static_cast(GetValue(ctx, x)); + default: + PADDLE_THROW(phi::errors::Unimplemented( + "Data type (%s) is not supported when casting data type.", + x.dtype())); + } +} + template void LinspaceKernel(const Context& ctx, const DenseTensor& start, @@ -49,18 +89,9 @@ void LinspaceKernel(const Context& ctx, const DenseTensor& number, DataType dtype, DenseTensor* out) { - auto start_t = phi::funcs::TransDataType(ctx, start, dtype); - auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype); - - DenseTensor n_start; - DenseTensor n_stop; - DenseTensor n_num; - phi::Copy(ctx, start_t, phi::CPUPlace(), false, 
&n_start); - T start_data = n_start.data()[0]; - phi::Copy(ctx, stop_t, phi::CPUPlace(), false, &n_stop); - T stop_data = n_stop.data()[0]; - phi::Copy(ctx, number, phi::CPUPlace(), false, &n_num); - int64_t num = static_cast(n_num.data()[0]); + T start_value = GetValueOfExpectedType(ctx, start); + T stop_value = GetValueOfExpectedType(ctx, stop); + int64_t num = GetValueOfExpectedType(ctx, number); PADDLE_ENFORCE_GT( num, @@ -72,16 +103,15 @@ void LinspaceKernel(const Context& ctx, out->Resize(phi::make_ddim({num})); T* out_data = ctx.template Alloc(out); - double step = 0; auto stream = ctx.stream(); - int block = 512; - int grid = (num + block - 1) / block; if (num != 1) { - step = (static_cast(stop_data - start_data)) / (num - 1); + int block = 512; + int grid = (num + block - 1) / block; + double step = (static_cast(stop_value - start_value)) / (num - 1); LinspaceKernelInner<<>>( - start_data, stop_data, step, num, out_data); + start_value, stop_value, step, num, out_data); } else { - LinspaceSpecialKernel<<>>(start_data, out_data); + LinspaceSpecialKernel<<<1, 1, 0, stream>>>(start_value, out_data); } } @@ -94,4 +124,8 @@ PD_REGISTER_KERNEL(linspace, float, int32_t, int64_t, - double) {} + double) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index bd0143379ce15..2b087f8dcae09 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -148,14 +148,16 @@ void EinsumGradKernel(const Context& dev_ctx, right = splits[1].substr(1); auto equation_for_A = - right + "," + ops[1] + "->" + gather_labels_except_reduction(ops[0]); + ops[1] + "," + right + "->" + gather_labels_except_reduction(ops[0]); auto equation_for_B = right + "," + ops[0] + "->" + gather_labels_except_reduction(ops[1]); auto operands_for_A = std::vector(); auto operands_for_B = std::vector(); DenseTensor dA, dB; - operands_for_A.push_back(&out_grad); + // dA = einsum(B, dC) operands_for_A.push_back(x[1]); + operands_for_A.push_back(&out_grad); + // dB = einsum(dC, A) operands_for_B.push_back(&out_grad); operands_for_B.push_back(x[0]); diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 73940a45cbde2..901147734b29f 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -13,6 +13,7 @@ // limitations under the License. 
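A concrete check of the operand reordering in the einsum_grad_impl.h hunk above, using the example equation "ij,jk->ik" (chosen here for illustration, not taken from the patch): then ops[0] = "ij", ops[1] = "jk", right = "ik", and the code builds

  equation_for_A == "jk,ik->ij"  evaluated on (B, dC)
  equation_for_B == "ik,ij->jk"  evaluated on (dC, A)

i.e. dA[i][j] = sum_k B[j][k] * dC[i][k] = (dC * B^T)[i][j] and dB[j][k] = sum_i A[i][j] * dC[i][k] = (A^T * dC)[j][k], which is the standard matmul backward and matches the new "// dA = einsum(B, dC)" and "// dB = einsum(dC, A)" comments.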
#pragma once +#include #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" @@ -55,7 +56,8 @@ inline static void ValidationCheck(const std::string& equation) { enum LabelType { ALL_TYPE = 0, Batch = 1, // ABO - Free, // AO, BO + AO, // AO -- free label + BO, // BO -- free label Contraction, // AB Reduction, // A, B }; @@ -125,18 +127,32 @@ inline std::vector union_labels(const std::vector& a, return res; } +// Apply transforms to all_labels and get another all_labels +inline std::vector TransformLabelsOrder( + const std::vector& all_labels, + const LabelMap& type, + std::vector new_order) { + std::vector ret; + for (auto cnt_type : new_order) { + std::vector tmp; + for (int c : all_labels) { + if (type[c] == cnt_type) tmp.push_back(c); + std::sort(tmp.begin(), tmp.end()); + } + ret.insert(ret.end(), tmp.begin(), tmp.end()); + } + return ret; +} + inline static void GlobalInfo(const std::vector& op_labels, const std::string& right, LabelMap* label2type, std::vector* sorted_labels) { - // sorted_labels: ['.', , ] - VLOG(5) << "GlobalInfo: " - << paddle::string::join_strings(*sorted_labels, ","); std::vector all; LabelMap counter(0); for (auto& ch : right) { // char int c = ch; - (*label2type)[c] = LabelType::Free; + (*label2type)[c] = LabelType::BO; } for (auto& op : op_labels) { @@ -146,39 +162,36 @@ inline static void GlobalInfo(const std::vector& op_labels, all.push_back(ch); } counter[c] += 1; - if ((*label2type)[c] != LabelType::Free && counter[c] == 2) + if ((*label2type)[c] != LabelType::BO && counter[c] == 2) (*label2type)[c] = LabelType::Contraction; else if (counter[c] == 2) (*label2type)[c] = LabelType::Batch; } } + + // BO is represent Free, so we need find the AO. + for (int c : op_labels[0]) { + if ((*label2type)[c] == LabelType::BO) (*label2type)[c] = LabelType::AO; + } + (*label2type)['.'] = LabelType::Batch; - std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Batch) - sorted_labels->push_back(static_cast(c)); - }); - std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Free) - sorted_labels->push_back(static_cast(c)); - }); - std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Contraction) - sorted_labels->push_back(static_cast(c)); - }); - std::for_each(all.begin(), all.end(), [&sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Reduction) - sorted_labels->push_back(static_cast(c)); - }); - VLOG(5) << "GlobalInfo: sorted_labels before: " - << paddle::string::join_strings(*sorted_labels, ","); + + *sorted_labels = TransformLabelsOrder(all, + *label2type, + {LabelType::Batch, + LabelType::AO, + LabelType::BO, + LabelType::Contraction, + LabelType::Reduction}); + if (counter[static_cast('.')] > 0) { std::vector tmp; tmp.push_back('.'); // push '.' 
in the front *sorted_labels = union_labels(tmp, *sorted_labels); - VLOG(5) << "GlobalInfo: sorted_labels after: " - << paddle::string::join_strings(*sorted_labels, ","); } + VLOG(5) << "GlobalInfo: sorted_labels after: " + << paddle::string::join_strings(*sorted_labels, ","); } inline static void InferLabelShape(const std::vector& op_labels, @@ -289,17 +302,20 @@ inline static void ParseEinsumEquation( *right = results[1].substr(1); ReplaceEllipsis(*right); auto op_labels = paddle::string::split_string(left, ","); + // split_string("i,") -> ["i"], we expect 2 op_labels. + if (left[left.size() - 1] == ',') op_labels.push_back(""); std::for_each(op_labels.begin(), op_labels.end(), ReplaceEllipsis); GlobalInfo(op_labels, *right, labeltype, all_labels); InferLabelShape(op_labels, inputs, labelshape, ellipsis_dims, broadcast_dims); - VLOG(5) << "Einsum Infershape: right:" << right; - VLOG(5) << "Einsum Infershape: op_labels:" - << paddle::string::join_strings(op_labels, "\n"); + VLOG(5) << "Einsum Infershape: right:" << *right; + VLOG(5) << "Einsum Infershape: left :" + << paddle::string::join_strings(op_labels, '\n'); InferOutputDims(*right, *broadcast_dims, *labelshape, output_dims); for (size_t i = 0; i < inputs.size(); ++i) { InferLabelPerm( op_labels[i], ellipsis_dims->at(i).size(), &((*label2perms)[i])); } + VLOG(5) << "Einsum Infershape: end"; } template @@ -327,10 +343,12 @@ std::vector GetShapeByType(const std::vector& all_labels, const LabelMap& perm, const LabelMap& label2shape, const std::vector& ellipsis, - LabelType filter) { + std::set filter) { std::vector res; for (T c : all_labels) { - if ((filter == LabelType::ALL_TYPE || type[c] == filter) && perm[c] != -1) { + if ((filter.count(LabelType::ALL_TYPE) || + filter.count(LabelType(type[c]))) && + perm[c] != -1) { if (c == '.') res.insert(res.end(), ellipsis.begin(), ellipsis.end()); else @@ -390,7 +408,8 @@ DenseTensor PerformContraction( const LabelMap& label2type, const LabelMap& label2shape, const std::vector>& ellipsis_dims, - const std::vector& broadcast_dims) { + const std::vector& broadcast_dims, + std::vector cache) { // Get All the Batches, so perm is auto all_valid = LabelMap(1); auto recover_dim = GetShapeByType(all_labels, @@ -398,36 +417,74 @@ DenseTensor PerformContraction( all_valid, label2shape, broadcast_dims, - LabelType::Batch); + {LabelType::Batch}); auto preprocess = [&](const DenseTensor& t, const LabelMap& perm, - const std::vector& ellipsis) -> DenseTensor { - auto frees = GetShapeByType( - all_labels, label2type, perm, label2shape, ellipsis, LabelType::Free); + const std::vector& ellipsis, + int operand_idx) -> DenseTensor { + // reshape + auto frees = GetShapeByType(all_labels, + label2type, + perm, + label2shape, + ellipsis, + {LabelType::AO, LabelType::BO}); auto conts = GetShapeByType(all_labels, label2type, perm, label2shape, ellipsis, - LabelType::Contraction); - auto trans_t = PerformTranspose( - dev_ctx, t, perm, all_labels, ellipsis, label2type); - auto mul_dims = GetShapeByType( - all_labels, label2type, perm, label2shape, ellipsis, LabelType::Batch); + {LabelType::Contraction}); + std::vector reordered_all_labels = all_labels; + if (operand_idx == 1) { + reordered_all_labels = TransformLabelsOrder(all_labels, + label2type, + {LabelType::Batch, + LabelType::Contraction, + LabelType::AO, + LabelType::BO, + LabelType::Reduction}); + } + // reduction + DenseTensor trans_t; + if (cache[operand_idx]->IsInitialized()) { + trans_t.ShareBufferWith(*(cache[operand_idx])); + } else { + auto reduct_t = 
PerformReduction( + dev_ctx, t, perm, all_labels, ellipsis, label2type); + trans_t = PerformTranspose( + dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); + cache[operand_idx]->ShareBufferWith(trans_t); + } + auto mul_dims = GetShapeByType(all_labels, + label2type, + perm, + label2shape, + ellipsis, + {LabelType::Batch}); recover_dim.insert(recover_dim.end(), frees.begin(), frees.end()); - mul_dims.push_back( - std::accumulate(frees.begin(), frees.end(), 1, std::multiplies())); - mul_dims.push_back( - std::accumulate(conts.begin(), conts.end(), 1, std::multiplies())); + if (operand_idx == 0) { + mul_dims.push_back(std::accumulate( + frees.begin(), frees.end(), 1, std::multiplies())); + mul_dims.push_back(std::accumulate( + conts.begin(), conts.end(), 1, std::multiplies())); + } else { + mul_dims.push_back(std::accumulate( + conts.begin(), conts.end(), 1, std::multiplies())); + mul_dims.push_back(std::accumulate( + frees.begin(), frees.end(), 1, std::multiplies())); + } VLOG(5) << "PerformContraction: mul_dims: " << paddle::string::join_strings(mul_dims, ","); trans_t.Resize(make_ddim(mul_dims)); return trans_t; }; - auto trans_a = preprocess(A, label2perm[0], ellipsis_dims[0]); - auto trans_b = preprocess(B, label2perm[1], ellipsis_dims[1]); + + // Reduction, Reshape and Matmul + auto trans_a = preprocess(A, label2perm[0], ellipsis_dims[0], 0); + auto trans_b = preprocess(B, label2perm[1], ellipsis_dims[1], 1); auto after_contraction = - Matmul(dev_ctx, trans_a, trans_b, false, true); + Matmul(dev_ctx, trans_a, trans_b, false, false); VLOG(5) << "PerformContraction: recover_dim: " << paddle::string::join_strings(recover_dim, ","); after_contraction.Resize(make_ddim(recover_dim)); @@ -465,10 +522,11 @@ void TransposeToOutput(const Context& dev_ctx, } template -void EinsumKernel(const Context& dev_ctx, - const std::vector& inputs, - const std::string& equation, - DenseTensor* out) { +void EinsumKernelImpl(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out, + std::vector cache) { ValidationCheck(equation); // collect the following informations to prepare einsum. 
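// Illustrative note (editor-added, not part of the patch): for a hypothetical
// equation "bik,bkj->bij", the classification above gives 'b' -> Batch (it is
// in the output and in both operands), 'i' -> AO (in the output, only in
// operand 0), 'j' -> BO (in the output, only in operand 1), and 'k' ->
// Contraction (absent from the output, present twice). TransformLabelsOrder
// then produces the global order [b, i, j, k]; operand 1 is later reordered to
// [b, k, i, j], so A reshapes to [B, I, K], B reshapes to [B, K, J], and a
// single Matmul(trans_a, trans_b, false, false) yields [B, I, J].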
LabelMap labelshape(0); @@ -498,22 +556,18 @@ void EinsumKernel(const Context& dev_ctx, if (inputs.size() == 2) { auto& A = inputs[0]; auto& B = inputs[1]; - // Reduce Procedure - auto reduce_A = PerformReduction( - dev_ctx, *A, label2perms[0], all_labels, ellipsis_dims[0], labeltype); - auto reduce_B = PerformReduction( - dev_ctx, *B, label2perms[1], all_labels, ellipsis_dims[1], labeltype); - // Contract Procedure + // Reduction and Contract Procedure dev_ctx.template Alloc(out); auto after_contraction = PerformContraction(dev_ctx, - reduce_A, - reduce_B, + *A, + *B, label2perms, all_labels, labeltype, labelshape, ellipsis_dims, - broadcast_dims); + broadcast_dims, + cache); TransposeToOutput(dev_ctx, after_contraction, right, @@ -545,4 +599,18 @@ void EinsumKernel(const Context& dev_ctx, } } +template +void EinsumKernel(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out) { + std::vector cache(inputs.size()); // set empty; TA, TB, TdC + std::vector cache_tensor( + inputs.size()); // set empty; TA, TB, TdC + for (size_t i = 0; i < inputs.size(); ++i) { + cache_tensor[i] = &cache[i]; + } + EinsumKernelImpl(dev_ctx, inputs, equation, out, cache_tensor); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index fa1f15672b903..3c06b238d145c 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -513,6 +513,20 @@ void MultiplyDoubleGradKernel(const Context& dev_ctx, funcs::InverseMultiplyFunctor>( dev_ctx, dout, ddy_safe, dx, axis); } + } else { + if (dx && dy) { + phi::funcs::ElemwiseGradCompute, MulGradDY>( + dev_ctx, + ddx_safe, + ddy_safe, + dout, + dout, + axis, + dx, + dy, + MulGradDX(), + MulGradDY()); + } } } @@ -683,6 +697,43 @@ struct MinGradDy { } }; +template +struct HeavisideGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(0); + } +}; + +template +struct HeavisideGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(x == static_cast(0)); + } +}; + +template +void ElementwiseHeavisideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs:: + ElemwiseGradCompute, HeavisideGradDy>( + dev_ctx, + x, + y, + dout, + dout, + axis, + dx, + dy, + HeavisideGradDx(), + HeavisideGradDy()); +} + template struct PowGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { diff --git a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h index 82bd9fba2a66d..e3cd6f5828d04 100644 --- a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h +++ b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h @@ -158,7 +158,7 @@ static void VisitDataType(DataType type, Visitor visitor) { visitor.template apply(); } else { PADDLE_THROW(errors::InvalidArgument( - "The recieved values data type %s can not meet input requirements. " + "The received values data type %s can not meet input requirements. " "Because the given values data type of searchsorted operators must be " "float32, float64, int32 or int64. Please input appropriate " "sorted_sequence again! 
", diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 821fda52ab102..d387096a70b75 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -54,6 +54,8 @@ void FloorDivideKernel(const Context& dev_ctx, int axis = -1; FloorDivideRawKernel(dev_ctx, x, y, axis, out); } +// Create the definition of Heaviside +DEFINE_CUDA_ELEMENTWISE_OP(ElementwiseHeaviside) // Create the definition of Pow DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) template @@ -130,6 +132,14 @@ PD_REGISTER_KERNEL(floor_divide_raw, phi::FloorDivideRawKernel, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside_raw, + KPS, + ALL_LAYOUT, + phi::ElementwiseHeavisideRawKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_KERNEL(elementwise_pow_raw, KPS, ALL_LAYOUT, diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index e02f4450a8bab..fabc6c0d13e7c 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -271,6 +271,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, } } +template +__device__ __forceinline__ void ElementwiseBinary( + OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) { +#pragma unroll + for (int idx = 0; idx < NX * NY; ++idx) { + out[idx] = static_cast(compute(in1[idx], in2[idx])); + } +} + /** * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h index 0e77b11988e76..eb45def836edc 100644 --- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h @@ -17,6 +17,7 @@ #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" +#include "xpu/kernel/simd_header.h" namespace phi { namespace kps { @@ -158,6 +159,19 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, } } +template +__device__ __forceinline__ void ElementwiseBinary( + OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) { + for (int idx = 0; idx < read_lens; ++idx) { + out[idx] = static_cast(compute(in1[idx], in2[idx])); + } +} + /** * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 993349f2d9e14..8b0c42c9d19b1 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -82,10 +82,10 @@ struct FastDivMod { * index of the output data. if input or output shape is [dim0, dim1] then dims * must be [dim1, dim0]. 
*/ -template struct BroadcastConfig { - FastDivMod divmoders[kDims]; + FastDivMod divmoders[phi::DDim::kMaxRank]; uint32_t strides[phi::DDim::kMaxRank]; + int kDims; HOSTDEVICE BroadcastConfig() {} HOSTDEVICE BroadcastConfig(const std::vector& out_dims, @@ -109,7 +109,7 @@ struct BroadcastConfig { std::multiplies()) : strides_in[i]; } - + kDims = dim_size; memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t)); memcpy(divmoders, divmoders_in.data(), kDims * sizeof(FastDivMod)); } @@ -246,6 +246,14 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { } } +template +__device__ __forceinline__ void Init(T* dst, T init_data, int read_lens) { +#pragma unroll + for (int i = 0; i < NX; i++) { + dst[i] = init_data; + } +} + /** * The difference from the above function is that * it supports different data types of inputs. @@ -311,6 +319,38 @@ __device__ __forceinline__ void ReadData(T* dst, } } +template +__device__ __forceinline__ void ReadData(T* dst, + const T* __restrict__ src, + int num, + int read_lens) { + if (IsBoundary) { // blockDim.x * NX > num + int thread_offset = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (idx + thread_offset < num) { + dst[idx] = src[thread_offset + idx]; + } + } + } else { // blockDim,x * NX < num + constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1; + constexpr int kVectorsPerThread = NX / kVectorSize; + int thread_offset = threadIdx.x * kVectorsPerThread; + + using VecType = details::VectorType; + const VecType* vec_input = reinterpret_cast(src); + VecType vec_temp[kVectorsPerThread]; + +#pragma unroll + for (int i = 0; i < kVectorsPerThread; ++i) { + vec_temp[i] = vec_input[thread_offset + i]; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + dst[idx] = *(reinterpret_cast(vec_temp) + idx); + } + } + } +} /** * @brief Read 1D data from global memory to register. The difference * from the above function is that it supports different data types of inputs. @@ -396,17 +436,12 @@ __device__ __forceinline__ void ReadData(ArgsT* dst, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template +template __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, + const details::BroadcastConfig& config, int total_num_output, int stride_nx, int stride_ny) { @@ -425,7 +460,8 @@ __device__ __forceinline__ void ReadDataBc( } } #pragma unroll - for (int i = 0; i < Rank; ++i) { + for (int i = 0; i < phi::DDim::kMaxRank; ++i) { + if (i >= config.kDims) break; auto fast_divmoder = config.divmoders[i].Divmod(index_output); index_output = fast_divmoder.val[0]; index_src += fast_divmoder.val[1] * config.strides[i]; @@ -576,6 +612,36 @@ __device__ __forceinline__ void WriteData(T* dst, } } +template +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, + int num, + int read_lens) { + if (IsBoundary) { + int thread_offset = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if ((thread_offset + idx) < num) { + dst[thread_offset + idx] = src[idx]; + } + } + } else { + // Vector type + constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 
2 : 1; + constexpr int kVectorsPerThread = NX / kVectorSize; + + int thread_offset = threadIdx.x * kVectorsPerThread; + using VecType = details::VectorType; + VecType* vec_dst = reinterpret_cast(dst); + VecType vec_temp[kVectorsPerThread]; +#pragma unroll + for (int idx = 0; idx < kVectorsPerThread; ++idx) { + vec_temp[idx] = *(reinterpret_cast(src) + idx); + vec_dst[thread_offset + idx] = vec_temp[idx]; + } + } +} + /** * @brief Write 2D data from register to global memory according to Tx type, and * store it as Ty type. @@ -715,18 +781,14 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) { * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. */ -template +template __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, - int total_num_output) { + const details::BroadcastConfig& config, + int total_num_output, + int read_lens = NX) { uint32_t thread_offset = block_offset + threadIdx.x * NX; uint32_t index_src = 0; @@ -740,7 +802,8 @@ __device__ __forceinline__ void ReadDataBc( } } #pragma unroll - for (int i = 0; i < Rank; ++i) { + for (int i = 0; i < phi::DDim::kMaxRank; ++i) { + if (i >= config.kDims) break; auto fast_divmoder = config.divmoders[i].Divmod(index_output); index_output = fast_divmoder.val[0]; index_src += fast_divmoder.val[1] * config.strides[i]; diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index a18fc7cbb3119..3799b9d4892f8 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -21,6 +21,39 @@ namespace phi { namespace kps { namespace details { +enum class OptType { // Optimize type of calc after input shape compressed + CanNotOptimize = -1, // can not optimize, broadcast first + N_1, // just like {1} op {100} or {100} op {1} + MN_N, // just like {100} op {3, 100} or {3, 100} op {100} + MN_M, // just like {3} op {3, 100} or {3, 100} op {3} + MNK_1N1, // just like {3} op {2, 3, 100} or {2, 3, 100} op {3} + MNK_M1K, // just like {2, 1, 100} op {2, 3, 100} or {2, 3, 100} op {2, 1, + // 100} +}; + +// Rules to determine whether dimensions can be merged +// rule 0 - xshape[idx] == yshape[idx] +// rule 1 - xshape[idx] == 1 && yshape[idx] != 1 +// rule 2 - xshape[idx] != 1 && yshape[idx] == 1 +static int judge_case(int a, int b) { + if (a == b) { + return 0; + } else if (a == 1 && b != 1) { + return 1; + } else if (a != 1 && b == 1) { + return 2; + } + return -1; +} + +static bool case_is_same(int case_front, int case_back) { + if (case_front == case_back) { + return true; + } else { + return false; + } +} + template struct alignas(sizeof(T) * VecSize) VectorType { T val[VecSize]; @@ -32,16 +65,24 @@ struct alignas(sizeof(T) * VecSize) VectorType { * must be [dim1, dim0]. 
*/ #pragma pack(4) -template struct BroadcastConfig { int strides_in[phi::DDim::kMaxRank]; int strides_out[phi::DDim::kMaxRank]; int in_dim[phi::DDim::kMaxRank]; - + int dim_after_cmp[phi::DDim::kMaxRank]; + int dim_size_after_cmp = 0; + int cmp_res = 0; + OptType cmp_type = OptType::CanNotOptimize; + int m = 1; + int n = 1; + int k = 1; + int buf_len = 0; + int kDims; HOSTDEVICE BroadcastConfig() {} HOSTDEVICE BroadcastConfig(const std::vector& out_dims, const std::vector& in_dims, + const std::vector& another_in_dims, int dim_size) { std::vector strides_in_tmp; std::vector strides_out_tmp; @@ -57,22 +98,191 @@ struct BroadcastConfig { for (int i = 0; i < dim_size; i++) { dim_tmp[i] = in_dims[i]; } - + kDims = dim_size; memcpy(strides_in, strides_in_tmp.data(), kDims * sizeof(int)); memcpy(strides_out, strides_out_tmp.data(), kDims * sizeof(int)); memcpy(in_dim, dim_tmp.data(), kDims * sizeof(int)); + + cmp_res = get_mnk_for_broadcast_ops(in_dims, another_in_dims); + get_opt_type(another_in_dims); + buf_len = get_buf_len(); + } + + int get_buf_len() { + if (cmp_type == OptType::CanNotOptimize) { + return 256; + } + int max_buf_len = 512; + int buf_len = m / 16 * 16; + if (buf_len == 0) { + buf_len = m; + } + return std::min(max_buf_len, buf_len); } __device__ inline int operator()(int index_output) const { int index_src = 0; -#pragma unroll - for (int i = kDims - 1; i >= 0; --i) { - int tmp_index = (index_output / strides_out[i]); - index_output = index_output - tmp_index * strides_out[i]; - index_src += (tmp_index % in_dim[i]) * strides_in[i]; + + switch (cmp_type) { + int div, mod, tmp_index; + case OptType::MNK_M1K: + div = index_output / (m * n); + mod = index_output % (m * n) % m; + index_src = div * m + mod; + break; + case OptType::MNK_1N1: + // index_src = index_output / m % n; + index_src = index_output % (m * n) / m; + break; + case OptType::N_1: + index_src = 0; + break; + case OptType::MN_N: + index_src = index_output / m; + break; + case OptType::MN_M: + index_src = index_output % m; + break; + case OptType::CanNotOptimize: + for (int i = kDims - 1; i >= 0; --i) { + tmp_index = (index_output / strides_out[i]); + index_output = index_output - tmp_index * strides_out[i]; + index_src += (tmp_index % in_dim[i]) * strides_in[i]; + } + break; } return index_src; } + + void get_opt_type(const std::vector& y_dim_after_cmp) { + if (dim_size_after_cmp == 1) { + if (dim_after_cmp[0] == 1 && y_dim_after_cmp[0] != 1) { // {1} op {n} + n = y_dim_after_cmp[0]; + cmp_type = OptType::N_1; + } else if (dim_after_cmp[0] != 1 && + y_dim_after_cmp[0] == 1) { // {n} op {1} + n = dim_after_cmp[0]; + cmp_type = OptType::N_1; + } else { + cmp_type = OptType::CanNotOptimize; // xshape == yshape + } + } + if (dim_size_after_cmp == 2) { + if (dim_after_cmp[0] == 1 && dim_after_cmp[1] != 1 && + y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1) { // {n} op {m, n} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + cmp_type = OptType::MN_N; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] == 1 && + y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1) { // {m} op {m, n} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + cmp_type = OptType::MN_M; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + y_dim_after_cmp[0] == 1 && + y_dim_after_cmp[1] != 1) { // {m, n} op {n} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + cmp_type = OptType::MN_N; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] == 1) { // {m, n} op {m} 
+ m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + cmp_type = OptType::MN_M; + } else { + cmp_type = OptType::CanNotOptimize; + } + } + if (dim_size_after_cmp == 3) { + if (dim_after_cmp[0] == 1 && dim_after_cmp[1] != 1 && + dim_after_cmp[2] == 1 && y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1 && + y_dim_after_cmp[2] != 1) { // {1, n, 1} op {m, n, k} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + k = y_dim_after_cmp[2]; + cmp_type = OptType::MNK_1N1; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + dim_after_cmp[2] != 1 && y_dim_after_cmp[0] == 1 && + y_dim_after_cmp[1] != 1 && + y_dim_after_cmp[2] == 1) { // {m, n, k} op {1, n, 1} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + k = dim_after_cmp[2]; + cmp_type = OptType::MNK_1N1; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] == 1 && + dim_after_cmp[2] != 1 && y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1 && + y_dim_after_cmp[2] != 1) { // {m, 1, k} op {m, n, k} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + k = y_dim_after_cmp[2]; + cmp_type = OptType::MNK_M1K; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + dim_after_cmp[2] != 1 && y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] == 1 && + y_dim_after_cmp[2] != 1) { // {m, n, k} op {m, 1, k} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + k = dim_after_cmp[2]; + cmp_type = OptType::MNK_M1K; + } else { + cmp_type = OptType::CanNotOptimize; + } + } + } + + int get_mnk_for_broadcast_ops(const std::vector& xshape, + const std::vector& yshape) { + int idx = 0; + int cmp_x = 0; + int cmp_y = 0; + bool is_same = false; + std::vector xshape_after_remove_ones = xshape; + std::vector yshape_after_remove_ones = yshape; + // first step: remove excess ones + std::vector::iterator x_iter = xshape_after_remove_ones.begin(); + std::vector::iterator y_iter = yshape_after_remove_ones.begin(); + for (; x_iter != xshape_after_remove_ones.end();) { + if (*x_iter == 1 && *y_iter == 1) { + x_iter = xshape_after_remove_ones.erase(x_iter); + y_iter = yshape_after_remove_ones.erase(y_iter); + } else { + x_iter++; + y_iter++; + } + } + // second step: compress dims + int after_cmp_idx = 0; + for (int i = 0; i < 3; i++) { + cmp_x = xshape_after_remove_ones[idx]; + cmp_y = yshape_after_remove_ones[idx]; + while ((idx + 1) < xshape_after_remove_ones.size()) { + is_same = case_is_same(judge_case(xshape_after_remove_ones[idx], + yshape_after_remove_ones[idx]), + judge_case(xshape_after_remove_ones[idx + 1], + yshape_after_remove_ones[idx + 1])); + if (is_same) { + cmp_x = cmp_x * xshape_after_remove_ones[idx + 1]; + cmp_y = cmp_y * yshape_after_remove_ones[idx + 1]; + idx++; + } else { + break; + } + } + idx = idx + 1; + dim_after_cmp[after_cmp_idx] = cmp_x; + after_cmp_idx++; + if (idx == xshape_after_remove_ones.size()) { + dim_size_after_cmp = after_cmp_idx; + return 0; + } + } + return -1; // can not compress dims + } }; #pragma pack() @@ -199,6 +409,14 @@ __device__ __inline__ void Init(T* dst, T init_data) { } } +template +__device__ __inline__ void Init(T* dst, T init_data, int read_lens) { +#pragma unroll + for (int i = 0; i < read_lens; i++) { + dst[i] = init_data; + } +} + /** * The difference from the above function is that * it supports different data types of inputs. 
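The shape-compression step above is easier to follow with a concrete host-side sketch. The code below is an editor-added illustration, not part of the patch: CompressBroadcastDims and JudgeCase are hypothetical stand-ins that mirror get_mnk_for_broadcast_ops and judge_case, dropping positions where both shapes are 1 and folding adjacent positions that share the same broadcast case.

#include <cstdio>
#include <utility>
#include <vector>

// 0: dims equal, 1: x broadcasts (x==1, y!=1), 2: y broadcasts, -1: invalid.
static int JudgeCase(int a, int b) {
  if (a == b) return 0;
  if (a == 1 && b != 1) return 1;
  if (a != 1 && b == 1) return 2;
  return -1;
}

static std::vector<std::pair<int, int>> CompressBroadcastDims(
    const std::vector<int>& x, const std::vector<int>& y) {
  // First pass: drop positions where both shapes are 1 (they never matter).
  std::vector<int> xs, ys;
  for (size_t i = 0; i < x.size(); ++i) {
    if (!(x[i] == 1 && y[i] == 1)) {
      xs.push_back(x[i]);
      ys.push_back(y[i]);
    }
  }
  // Second pass: fold adjacent positions that share the same broadcast case.
  std::vector<std::pair<int, int>> out;
  for (size_t i = 0; i < xs.size();) {
    int cx = xs[i], cy = ys[i];
    size_t j = i + 1;
    while (j < xs.size() &&
           JudgeCase(xs[j], ys[j]) == JudgeCase(xs[i], ys[i])) {
      cx *= xs[j];
      cy *= ys[j];
      ++j;
    }
    out.emplace_back(cx, cy);
    i = j;
  }
  return out;
}

int main() {
  // {2, 3, 1, 100} op {2, 3, 4, 100} compresses to {6, 1, 100} op {6, 4, 100}.
  for (const auto& d : CompressBroadcastDims({2, 3, 1, 100}, {2, 3, 4, 100})) {
    std::printf("(%d, %d) ", d.first, d.second);
  }
  std::printf("\n");
  return 0;
}

With three compressed dimensions of the form {m, 1, k} against {m, n, k}, get_opt_type in the hunk above classifies this pair as OptType::MNK_M1K, which is the case served by the specialized ReadDataBcM1kMnk read path introduced later in this file.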
@@ -251,6 +469,26 @@ __device__ __inline__ void ReadData(T* dst, } } +template +__device__ __inline__ void ReadData(T* dst, + const T _global_ptr_* src, + int num, + int read_lens) { + int thread_offset = core_id() * read_lens; + __local__ T in_temp[1]; + if (IsBoundary) { // core_num() * read_lens > num +#pragma unroll + for (int idx = 0; idx < read_lens; ++idx) { + if (idx + thread_offset < num) { + GM2LM(src + thread_offset + idx, in_temp, sizeof(T)); + dst[idx] = in_temp[0]; + } + } + } else { // core_num() * read_lens < num + GM2LM(src + thread_offset, dst, read_lens * sizeof(T)); + } +} + /** * @brief Read 1D data from global memory to register. The difference * from the above function is that it supports different data types of inputs. @@ -312,7 +550,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst, * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For xpu, * core_id() is used as the index. - * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than * NX x NY x core_num(), boundary judgment is required to avoid memory access @@ -328,16 +565,11 @@ __device__ __forceinline__ void ReadData(ArgsT* dst, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template +template __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, uint32_t block_offset, - details::BroadcastConfig config, + const details::BroadcastConfig& config, int total_num_output, int stride_nx, int stride_ny) { @@ -479,10 +711,32 @@ __device__ __forceinline__ void ReadDataReduce( * size: The current block needs to load size elements continuously. */ +template +__device__ void WriteData(T _global_ptr_* dst, + const T* src, + int num, + int read_lens) { + int thread_offset = core_id() * read_lens; + __local__ T in_temp[1]; + + if (IsBoundary) { // core_num() * read_lens > num +#pragma unroll + for (int idx = 0; idx < read_lens; ++idx) { + if (idx + thread_offset < num) { + in_temp[0] = src[idx]; + LM2GM(in_temp, dst + idx + thread_offset, sizeof(T)); + } + } + } else { // core_num() * read_lens < num + LM2GM(src, dst + thread_offset, read_lens * sizeof(T)); + } +} + template __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) { int thread_offset = core_id() * NX; __local__ T in_temp[1]; + if (IsBoundary) { // core_num() * NX > num #pragma unroll for (int idx = 0; idx < NX; ++idx) { @@ -621,6 +875,272 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { } } +/** + * @brief Read data from global memory to local memory with broadcast + * {m, 1, k}-> {m, n, k} form. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. 
+ */ +template +__device__ __inline__ void ReadDataBcM1kMnk( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + + int m_pos = index_base % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, dst, last_col * sizeof(T)); + int n_pos = index_output % (m * n) / m; + int next_part_index = 0; + if (n_pos != config.n - 1) { + next_part_index = index_base / m * m; + } else { + next_part_index = (index_base / m + 1) * m; + } + GM2LM(src + next_part_index, + dst + last_col, + (read_lens - last_col) * sizeof(T)); + } else { + GM2LM(src + index_base, dst, read_lens * sizeof(T)); + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {m, 1}-> {m, n} form. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBcM1Mn( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + + int m_pos = index_base % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, dst, last_col * sizeof(T)); + GM2LM(src, dst + last_col, (read_lens - last_col) * sizeof(T)); + } else { + GM2LM(src + index_base, dst, read_lens * sizeof(T)); + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {1, n}-> {m, n} form. + * + * @template paraments + * T: Data type of register. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBc1NMn( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + T in_temp; + + int m_pos = index_output % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < last_col; i++) { + dst[i] = in_temp; + } + GM2LM(src + index_base + 1, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens - last_col; i++) { + dst[last_col + i] = in_temp; + } + } else { + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens; i++) { + dst[i] = in_temp; + } + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {1, n, 1}-> {m, n, k} form. + * + * @template paraments + * T: Data type of register. 
+ * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBc1N1Mnk( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + T in_temp; + + int m_pos = index_output % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < last_col; i++) { + dst[i] = in_temp; + } + int n_pos = index_output % (m * n) / m; + int next_part_index = 0; + if (n_pos != n - 1) { + next_part_index = n_pos + 1; + } else { + next_part_index = 0; + } + GM2LM(src + next_part_index, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens - last_col; i++) { + dst[last_col + i] = in_temp; + } + } else { + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens; i++) { + dst[i] = in_temp; + } + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {1}-> {n} form. + * + * @template paraments + * T: Data type of register. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBc1N(T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + T in_temp; + + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens; i++) { + dst[i] = in_temp; + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * form which can not compress. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * total_num_output: Total number of original output. + * read_lens: The number of data continuously loaded by each thread. 
+ */ +template +__device__ __inline__ void ReadDataBcCanNotCmp( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int total_num_output, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + T in_temp; + int cache_size = 256; + __local__ T src_temp[cache_size]; + GM2LM(src + index_base, src_temp, cache_size * sizeof(T)); + + for (int nx = 0; nx < read_lens; ++nx) { + index_output = thread_offset + nx; + if (IsBoundary) { + if (index_output >= total_num_output) { + break; + } + } + int index_src = config(index_output); + if (index_src >= index_base && index_src < index_base + cache_size) { + in_temp = src_temp[index_src - index_base]; + } else { + GM2LM(src + index_src, &in_temp, sizeof(T)); + } + dst[nx] = in_temp; + } +} + /** * @brief Read 1D data from global memory to register with broadcast form. * @@ -630,7 +1150,6 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For xpu, * core_id() is used as the index. - * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than * NX x NY x core_num(), boundary judgment is required to avoid memory access @@ -642,36 +1161,31 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { * block_offset: The data offset of this block, core_num() * blockIdx.x * NX; * config: Calculation configuration of broadcast. It is used to calculate the * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. * total_num_output: Total number of original output. 
*/ -template -__device__ __inline__ void ReadDataBc( - T* dst, - const T _global_ptr_* src, - uint32_t block_offset, - const details::BroadcastConfig& config, - int total_num_output) { - int thread_offset = block_offset + core_id() * NX; - int index_src = 0; +template +__device__ __inline__ void ReadDataBc(T* dst, + const T _global_ptr_* src, + uint32_t block_offset, + const details::BroadcastConfig& config, + int total_num_output, + int read_lens) { + int thread_offset = block_offset + core_id() * read_lens; - __local__ T in_temp; -#pragma unroll - for (int nx = 0; nx < NX; ++nx) { - int index_output = thread_offset + nx; - index_src = 0; - if (IsBoundary) { - if (index_output >= total_num_output) { - break; - } - } - index_src = config(index_output); - GM2LM(src + index_src, &in_temp, sizeof(T)); - dst[nx] = in_temp; + if (config.cmp_type == details::OptType::MNK_M1K) { + ReadDataBcM1kMnk(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::N_1) { + ReadDataBc1N(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::MN_M) { + ReadDataBcM1Mn(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::MN_N) { + ReadDataBc1NMn(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::MNK_1N1) { + ReadDataBc1N1Mnk(dst, src, thread_offset, config, read_lens); + } else { + ReadDataBcCanNotCmp( + dst, src, thread_offset, config, total_num_output, read_lens); } } diff --git a/paddle/phi/kernels/primitive/kernel_primitives.h b/paddle/phi/kernels/primitive/kernel_primitives.h index b5a1e88acc32b..f68a046ae077a 100644 --- a/paddle/phi/kernels/primitive/kernel_primitives.h +++ b/paddle/phi/kernels/primitive/kernel_primitives.h @@ -40,12 +40,15 @@ #define GRID_NUM_X cluster_num() #define GRID_NUM_Y 0 #define GRID_NUM_Z 0 - +#define VecSizeL 512 +#define VecSizeM 256 +#define VecSizeS 128 #else #define KPStream gpuStream_t #define KPDevice phi::GPUContext #define _ptr_ +#define __simd__ #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y @@ -63,6 +66,9 @@ #define GRID_NUM_Y gridDim.y #define GRID_NUM_Z gridDim.z +#define VecSizeL 4 +#define VecSizeM 2 +#define VecSizeS 1 #endif // include file diff --git a/paddle/phi/kernels/sparse/activation_grad_kernel.cc b/paddle/phi/kernels/sparse/activation_grad_kernel.cc deleted file mode 100644 index 9eca14e660939..0000000000000 --- a/paddle/phi/kernels/sparse/activation_grad_kernel.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/kernels/sparse/activation_grad_kernel.h" -#include "paddle/phi/kernels/activation_grad_kernel.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { -namespace sparse { - -template -void SparseReluGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const SparseCooTensor& out_grad, - SparseCooTensor* x_grad) { - DenseTensor non_zero_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor non_zero_elements = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &non_zero_indices); - phi::ReluGradKernel(dev_ctx, - x.non_zero_elements(), - out_grad.non_zero_elements(), - &non_zero_elements); - x_grad->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_relu_grad, - CPU, - ALL_LAYOUT, - phi::sparse::SparseReluGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_relu_grad, - GPU, - ALL_LAYOUT, - phi::sparse::SparseReluGradKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -#endif diff --git a/paddle/phi/kernels/sparse/activation_grad_kernel.h b/paddle/phi/kernels/sparse/activation_grad_kernel.h deleted file mode 100644 index aab4a3e5a590b..0000000000000 --- a/paddle/phi/kernels/sparse/activation_grad_kernel.h +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/sparse_coo_tensor.h" - -namespace phi { -namespace sparse { - -template -void SparseReluGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const SparseCooTensor& out_grad, - SparseCooTensor* x_grad); - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/activation_kernel.cc b/paddle/phi/kernels/sparse/activation_kernel.cc deleted file mode 100644 index a1a00897d33cf..0000000000000 --- a/paddle/phi/kernels/sparse/activation_kernel.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/kernels/sparse/activation_kernel.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { -namespace sparse { - -template -void SparseReluKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { - DenseTensor non_zero_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor non_zero_elements = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &non_zero_indices); - phi::ReluKernel( - dev_ctx, x.non_zero_elements(), &non_zero_elements); - out->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_relu, - CPU, - ALL_LAYOUT, - phi::sparse::SparseReluKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_relu, - GPU, - ALL_LAYOUT, - phi::sparse::SparseReluKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -#endif diff --git a/paddle/phi/kernels/sparse/activation_kernel.h b/paddle/phi/kernels/sparse/activation_kernel.h deleted file mode 100644 index 568c0aa8b2ecb..0000000000000 --- a/paddle/phi/kernels/sparse/activation_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/activation_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -namespace phi { -namespace sparse { - -template -void SparseReluKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out); - -template -SparseCooTensor SparseRelu(const Context& dev_ctx, const SparseCooTensor& x) { - DenseTensor indices, values; - SparseCooTensor coo(indices, values, x.dims()); - SparseReluKernel(dev_ctx, x, &coo); - return coo; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.cc b/paddle/phi/kernels/sparse/unary_grad_kernel.cc new file mode 100644 index 0000000000000..1fd3ef2711299 --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_grad_kernel.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/activation_grad_kernel.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#define DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \ + namespace phi { \ + namespace sparse { \ + \ + template \ + void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& out_grad, \ + SparseCooTensor* x_grad) { \ + DenseTensor non_zero_indices = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_indices()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x_or_out.non_zero_indices(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_indices); \ + phi::DenseKernelFunc(dev_ctx, \ + x_or_out.non_zero_elements(), \ + out_grad.non_zero_elements(), \ + &non_zero_elements); \ + x_grad->SetMember( \ + non_zero_indices, non_zero_elements, x_or_out.dims(), true); \ + } \ + \ + template \ + void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& out_grad, \ + SparseCsrTensor* out) { \ + DenseTensor non_zero_crows = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_crows()); \ + DenseTensor non_zero_cols = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_cols()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x_or_out.non_zero_crows(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_crows); \ + phi::Copy(dev_ctx, \ + x_or_out.non_zero_cols(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_cols); \ + phi::DenseKernelFunc(dev_ctx, \ + x_or_out.non_zero_elements(), \ + out_grad.non_zero_elements(), \ + &non_zero_elements); \ + out->SetMember( \ + non_zero_crows, non_zero_cols, non_zero_elements, x_or_out.dims()); \ + } \ + } \ + } + +#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ 
+ phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } +#else +// This macro definition is empty when GPU is disabled +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) +#endif + +#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +#define DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(kernel_name, \ + DenseKernelFunc) \ + DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \ + REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +// NOTE: the following code is to bypass the restriction of Paddle +// kernel registration mechanism. Do NOT refactor them unless you +// know what you are doing. +// If you want to implement any new kernel, please follow `sin_grad`, +// `tanh_grad` etc, do NOT follow the following `relu_grad`. +DEFINE_SPARSE_UNARY_GRAD_KERNEL(ReluGradKernel) + +PD_REGISTER_KERNEL(sparse_coo_relu_grad, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooReluGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} +PD_REGISTER_KERNEL(sparse_csr_relu_grad, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCsrReluGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sparse_coo_relu_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooReluGradKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_csr_relu_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCsrReluGradKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} +#endif + +DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sin_grad, SinGradKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(tanh_grad, TanhGradKernel) diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.h b/paddle/phi/kernels/sparse/unary_grad_kernel.h new file mode 100644 index 0000000000000..24ea4fee1a4fd --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_grad_kernel.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +#define DECLARE_SPARSE_UNARY_GRAD_KERNEL(name) \ + template \ + void SparseCoo##name##GradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + const SparseCooTensor& out_grad, \ + SparseCooTensor* x_grad); \ + \ + template \ + void SparseCsr##name##GradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + const SparseCsrTensor& out_grad, \ + SparseCsrTensor* x_grad); + +namespace phi { +namespace sparse { + +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Relu) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sqrt) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sin) + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_kernel.cc b/paddle/phi/kernels/sparse/unary_kernel.cc new file mode 100644 index 0000000000000..e02d7757664fa --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_kernel.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#define DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ + namespace phi { \ + namespace sparse { \ + \ + template \ + void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + SparseCooTensor* out) { \ + DenseTensor non_zero_indices = \ + phi::EmptyLike(dev_ctx, x.non_zero_indices()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x.non_zero_indices(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_indices); \ + phi::DenseKernelFunc( \ + dev_ctx, x.non_zero_elements(), &non_zero_elements); \ + out->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); \ + } \ + \ + template \ + void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + SparseCsrTensor* out) { \ + DenseTensor non_zero_crows = \ + phi::EmptyLike(dev_ctx, x.non_zero_crows()); \ + DenseTensor non_zero_cols = \ + phi::EmptyLike(dev_ctx, x.non_zero_cols()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x.non_zero_crows(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_crows); \ + phi::Copy(dev_ctx, \ + x.non_zero_cols(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_cols); \ + phi::DenseKernelFunc( \ + dev_ctx, x.non_zero_elements(), &non_zero_elements); \ + out->SetMember( \ + non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); \ + } \ + } \ + } + +#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + 
PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } +#else +// This macro definition is empty when GPU is disabled +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) +#endif + +#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +#define DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ + REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +// NOTE: the following code is to bypass the restriction of Paddle +// kernel registration mechanism. Do NOT refactor them unless you +// know what you are doing. +// If you want to implement any new kernel, please follow `sin`, +// `tanh` etc, do NOT follow `sqrt`. +DEFINE_SPARSE_UNARY_KERNEL(SqrtKernel) + +PD_REGISTER_KERNEL(sparse_coo_sqrt, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooSqrtKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} +PD_REGISTER_KERNEL(sparse_csr_sqrt, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCsrSqrtKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sparse_coo_sqrt, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooSqrtKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_csr_sqrt, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCsrSqrtKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +#endif + +DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(sin, SinKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(tanh, TanhKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(relu, ReluKernel) diff --git a/paddle/phi/kernels/sparse/unary_kernel.h b/paddle/phi/kernels/sparse/unary_kernel.h new file mode 100644 index 0000000000000..4470173c143db --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#define DECLARE_SPARSE_UNARY_KERNEL(name) \ + template \ + void SparseCoo##name##Kernel( \ + const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out); \ + \ + template \ + void SparseCsr##name##Kernel( \ + const Context& dev_ctx, const SparseCsrTensor& x, SparseCsrTensor* out); + +namespace phi { +namespace sparse { + +DECLARE_SPARSE_UNARY_KERNEL(Relu) +DECLARE_SPARSE_UNARY_KERNEL(Sqrt) +DECLARE_SPARSE_UNARY_KERNEL(Sin) + +template +SparseCooTensor SparseRelu(const Context& dev_ctx, const SparseCooTensor& x) { + DenseTensor indices, values; + SparseCooTensor coo(indices, values, x.dims()); + SparseCooReluKernel(dev_ctx, x, &coo); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 13a5a6fd4a449..c760c966b0647 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -95,6 +95,16 @@ KernelSignature ElementwiseFloorDivOpArgumentMapping( return KernelSignature("floor_divide_raw", {"X", "Y"}, {"axis"}, {"Out"}); } +KernelSignature ElementwiseHeavisideOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("elementwise_heaviside", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature( + "elementwise_heaviside_raw", {"X", "Y"}, {"axis"}, {"Out"}); +} + KernelSignature ElementwisePowOpArgumentMapping( const ArgumentMappingContext& ctx) { int axis = paddle::any_cast(ctx.Attr("axis")); @@ -208,6 +218,15 @@ KernelSignature ElementwiseMinGradOpArgumentMapping( return KernelSignature( "minimum_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } + +KernelSignature ElementwiseHeavisideGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("elementwise_heaviside_grad", + {"X", "Y", "Out@GRAD"}, + {"axis"}, + {"X@GRAD", "Y@GRAD"}); +} + KernelSignature ElementwisePowGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("elementwise_pow_grad", @@ -258,6 +277,8 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mod, phi::ElementwiseModOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_floordiv, phi::ElementwiseFloorDivOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside, + phi::ElementwiseHeavisideOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_pow, phi::ElementwisePowOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, @@ -292,5 +313,7 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_max_grad, phi::ElementwiseMaxGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_min_grad, phi::ElementwiseMinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside_grad, + phi::ElementwiseHeavisideGradOpArgumentMapping); 
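The new elementwise_heaviside mapping routes the default case (axis == -1) to the `elementwise_heaviside` kernel and everything else to the `_raw` variant that still carries `axis`. The op itself computes the Heaviside step function, with `y` supplying the value at `x == 0`; NumPy's `np.heaviside` follows the same convention, which makes it a convenient reference for the expected numerics:

```python
import numpy as np

x = np.array([-2.0, 0.0, 3.5])
y = np.array([0.5, 0.5, 0.5])     # value returned where x == 0
print(np.heaviside(x, y))         # [0.  0.5 1. ]
```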
PD_REGISTER_ARG_MAPPING_FN(elementwise_pow_grad, phi::ElementwisePowGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/scale_sig.cc b/paddle/phi/ops/compat/scale_sig.cc index 95deb007d99d9..8061a1fbd610a 100644 --- a/paddle/phi/ops/compat/scale_sig.cc +++ b/paddle/phi/ops/compat/scale_sig.cc @@ -30,7 +30,7 @@ namespace phi { * The infrt declare like: * * def PDKEL_Reshape_to_CPU : Pat< - * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), // OpMaker arguements + * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), // OpMaker arguments * (PDKEL_ReshapeKernelAttr $x, fn($shape_attr)>; // Kernel arguments * def PDKEL_Reshape_to_CPU : Pat< * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), diff --git a/paddle/phi/tests/api/test_matmul_api.cc b/paddle/phi/tests/api/test_matmul_api.cc index e2c324a6775c8..0d4ec7bd4f592 100644 --- a/paddle/phi/tests/api/test_matmul_api.cc +++ b/paddle/phi/tests/api/test_matmul_api.cc @@ -179,8 +179,18 @@ TEST(API, matmul_double_grad) { auto dx_grad = paddle::experimental::full({3, 3}, 2.0); // 2. test API - const auto out = paddle::experimental::matmul_double_grad( - x, y, out_grad, dx_grad, {}, false, false); + std::vector> out( + 3, std::vector(1)); + paddle::experimental::matmul_double_grad(x, + y, + out_grad, + dx_grad, + {}, + false, + false, + &out[0][0], + &out[1][0], + &out[2][0]); // 3. check result ASSERT_EQ(out.size(), 3UL); diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc index 7c4aa16425907..c00113389adb7 100644 --- a/paddle/phi/tests/api/test_sparse_conv_api.cc +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -77,11 +77,11 @@ void TestConv3dBase(const std::vector& indices, kernel.size() * sizeof(T)); if (!std::is_same::value) { - auto outs = paddle::experimental::sparse::conv3d( + auto tensor_out = paddle::experimental::sparse::conv3d( x, weight, paddings, dilations, strides, 1, false); - auto out = std::dynamic_pointer_cast( - std::get<0>(outs).impl()); + auto out = + std::dynamic_pointer_cast(tensor_out.impl()); ASSERT_EQ(correct_out_dims.size(), out->dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { ASSERT_EQ(correct_out_dims[i], out->dims()[i]); diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt index ca6d20045d171..150336a1ed694 100644 --- a/paddle/phi/tests/common/CMakeLists.txt +++ b/paddle/phi/tests/common/CMakeLists.txt @@ -2,6 +2,7 @@ cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest) cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest) cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest) cc_test(phi_test_place SRCS test_place.cc DEPS phi_place) +cc_test(phi_test_int_array SRCS test_int_array.cc DEPS int_array api_int_array phi phi_api) if (WITH_GPU) nv_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) endif() diff --git a/paddle/phi/tests/common/test_int_array.cc b/paddle/phi/tests/common/test_int_array.cc new file mode 100644 index 0000000000000..b6c4f2b1ea8e3 --- /dev/null +++ b/paddle/phi/tests/common/test_int_array.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/api/include/api.h" + +#include "paddle/phi/api/include/context_pool.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/kernels/full_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" + +#include "gtest/gtest.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +#endif + +namespace phi { +namespace tests { + +TEST(IntArray, ConstructFromCPUDenseTensor) { + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + const auto* dev_ctx = + static_cast(pool.Get(CPUPlace())); + phi::DenseTensor shape = Full(*dev_ctx, {2}, 3); + phi::DenseTensor out = Full(*dev_ctx, shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromCPUDenseTensorVector) { + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + const auto* dev_ctx = + static_cast(pool.Get(CPUPlace())); + phi::DenseTensor shape0 = Full(*dev_ctx, {1}, 3); + phi::DenseTensor shape1 = Full(*dev_ctx, {1}, 3); + std::vector shape{shape0, shape1}; + phi::DenseTensor out = Full(*dev_ctx, shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromCPUTensor) { + auto shape = paddle::experimental::full({2}, 3, DataType::INT64); + auto out = paddle::experimental::full(shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromCPUTensorVector) { + auto shape0 = paddle::experimental::full({2}, 3, DataType::INT64); + auto shape1 = paddle::experimental::full({2}, 3, DataType::INT32); + + std::vector shape{shape0, shape0}; + auto out = paddle::experimental::full(shape, 1); + + std::vector shape_new{shape0, shape1}; + auto out1 = paddle::experimental::full(shape_new, 1); + + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); + + ASSERT_EQ(out1.dims().size(), 2); + ASSERT_EQ(out1.dims()[0], 3); + ASSERT_EQ(out1.dims()[1], 3); + ASSERT_EQ(out1.numel(), 9); +} + +TEST(IntArray, ThrowException) { + auto shape = paddle::experimental::full({2}, 3, DataType::FLOAT32); + auto create_int_array = [&shape]() -> paddle::experimental::IntArray { + paddle::experimental::IntArray int_array{shape}; + return int_array; + }; + ASSERT_ANY_THROW(create_int_array()); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +TEST(IntArray, ConstructFromGPUDenseTensor) { + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + const auto* dev_ctx = + static_cast(pool.Get(GPUPlace())); + phi::DenseTensor shape = Full(*dev_ctx, {2}, 3); + phi::DenseTensor out = Full(*dev_ctx, shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + 
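The tests above exercise the IntArray conversions that `full` relies on: the shape may be a host list, a 1-D integer tensor, or a list of integer tensors (even with mixed INT32/INT64 dtypes), while a FLOAT32 tensor must throw. A hypothetical pure-Python helper that mirrors just those accepted forms (names and structure are illustrative, not the real IntArray implementation):

```python
import numpy as np

INT_KINDS = ("i", "u")  # signed/unsigned integer dtypes

def to_shape(shape_arg):
    """Mirror the inputs the IntArray tests accept: a plain list of ints,
    a 1-D integer tensor, or a list of integer tensors. Floating-point
    inputs are rejected, like the ThrowException test expects."""
    if isinstance(shape_arg, (list, tuple)) and all(isinstance(v, int) for v in shape_arg):
        return list(shape_arg)
    if isinstance(shape_arg, np.ndarray):
        if shape_arg.dtype.kind not in INT_KINDS:
            raise TypeError("shape tensor must have an integer dtype")
        return [int(v) for v in shape_arg.reshape(-1)]
    # A list of integer tensors, possibly with mixed int dtypes.
    dims = []
    for t in shape_arg:
        t = np.asarray(t)
        if t.dtype.kind not in INT_KINDS:
            raise TypeError("shape tensor must have an integer dtype")
        dims.extend(int(v) for v in t.reshape(-1))
    return dims

print(to_shape(np.array([3, 3], dtype=np.int64)))                      # [3, 3]
print(to_shape([np.array([3], np.int64), np.array([3], np.int32)]))    # [3, 3]
```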
ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromGPUDenseTensorVector) { + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + const auto* dev_ctx = + static_cast(pool.Get(GPUPlace())); + phi::DenseTensor shape0 = Full(*dev_ctx, {1}, 3); + phi::DenseTensor shape1 = Full(*dev_ctx, {1}, 3); + std::vector shape{shape0, shape1}; + phi::DenseTensor out = Full(*dev_ctx, shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromGPUTensor) { + auto shape = paddle::experimental::full({2}, 3, DataType::INT64, GPUPlace()); + auto out = paddle::experimental::full(shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromGPUTensorVector) { + auto shape0 = paddle::experimental::full({2}, 3, DataType::INT64, GPUPlace()); + auto shape1 = paddle::experimental::full({2}, 3, DataType::INT32, GPUPlace()); + + std::vector shape{shape0, shape0}; + auto out = paddle::experimental::full(shape, 1); + + std::vector shape_new{shape0, shape1}; + auto out1 = paddle::experimental::full(shape_new, 1); + + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); + + ASSERT_EQ(out1.dims().size(), 2); + ASSERT_EQ(out1.dims()[0], 3); + ASSERT_EQ(out1.dims()[1], 3); + ASSERT_EQ(out1.numel(), 9); +} +#endif + +} // namespace tests +} // namespace phi diff --git a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc index 43640da270aad..05781156cd1d6 100644 --- a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc @@ -24,9 +24,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/activation_grad_kernel.h" #include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/activation_grad_kernel.h" -#include "paddle/phi/kernels/sparse/activation_kernel.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" +#include "paddle/phi/kernels/sparse/unary_kernel.h" namespace phi { namespace tests { @@ -70,7 +70,7 @@ TEST(DEV_API, sparse_relu) { SparseCooTensor sparse_out_grad( sparse_coo.non_zero_indices(), dense_out, {3, 4}); - sparse::SparseReluGradKernel( + sparse::SparseCooReluGradKernel( dev_ctx_cpu, sparse_coo, sparse_out_grad, &sparse_grad_x); cmp = memcmp(dense_grad_x.data(), diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index fe5f2c25ca551..fdcd560658146 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -14,6 +14,8 @@ elseif(WITH_ASCEND_CL) SET(PACKAGE_NAME "paddlepaddle-npu") elseif(WITH_XPU) SET(PACKAGE_NAME "paddlepaddle-xpu") +elseif(WITH_IPU) + SET(PACKAGE_NAME "paddlepaddle-ipu") else() SET(PACKAGE_NAME "paddlepaddle") endif() diff --git a/python/paddle/README.rst b/python/paddle/README.rst index e779f1264c451..2d48ee4b26caf 100644 --- a/python/paddle/README.rst +++ b/python/paddle/README.rst @@ -88,7 +88,7 @@ If you want to install paddlepaddle-gpu with cuda version of 9.0 ,10.0 ,10.1 ,or After the installation is complete, you can use `python` or `python3` to enter the Python interpreter and then use `import paddle.fluid` and `fluid.install_check.run_check()` -If `Your Paddle Fluid is installed succesfully!` appears, to verify that the installation was successful. +If `Your Paddle Fluid is installed successfully!` appears, to verify that the installation was successful. 
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index cb0135d9b4c29..8c2ec1acf072a 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -269,6 +269,7 @@ from .tensor.math import fmin # noqa: F401 from .tensor.math import inner # noqa: F401 from .tensor.math import outer # noqa: F401 +from .tensor.math import heaviside # noqa: F401 from .tensor.math import frac # noqa: F401 from .tensor.random import bernoulli # noqa: F401 @@ -635,4 +636,5 @@ 'renorm', 'take_along_axis', 'put_along_axis', + 'heaviside', ] diff --git a/python/paddle/distributed/auto_parallel/cost/__init__.py b/python/paddle/distributed/auto_parallel/cost/__init__.py index 7bc8a81b79f8e..ea6b3bc5b7e76 100644 --- a/python/paddle/distributed/auto_parallel/cost/__init__.py +++ b/python/paddle/distributed/auto_parallel/cost/__init__.py @@ -12,9 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import OP_COST_FACTORY +from .base_cost import _g_op_cost_factory from .base_cost import Cost -from .comm_op_cost import AllreduceSumCost -from .comp_op_cost import MatmulV2OpCost +from .base_cost import CommContext +from .base_cost import build_comm_desc from .tensor_cost import TensorCost from .estimate_cost import CostEstimator + +from .comp_op_cost import MatmulV2OpCost + +from .comm_op_cost import SendOpCost +from .comm_op_cost import RecvOpCost +from .comm_op_cost import IdentityOpCost +from .comm_op_cost import BroadcastOpCost +from .comm_op_cost import AllgatherOpCost +from .comm_op_cost import AllreduceSumOpCost diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/cost/base_cost.py index c4ebd836129e2..f1843b8f16527 100644 --- a/python/paddle/distributed/auto_parallel/cost/base_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/base_cost.py @@ -13,13 +13,29 @@ # limitations under the License from collections import OrderedDict +from functools import reduce + import paddle +from ..cluster import LinkType +from ..process_group import get_process_group + COMM_OP_TYPE = [ - "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum" + "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum", + "c_identity" ] NON_COMP_TYPE = ["while"] + COMM_OP_TYPE -OP_COST_FACTORY = {} +_g_op_cost_factory = {} + + +def build_comm_desc(op_type, group_ranks, dtype, shape, attrs=None): + desc = {} + desc["op"] = op_type + desc["group_ranks"] = group_ranks + desc["inputs"] = {"X": [(dtype, shape)]} + if attrs is not None: + desc["attrs"] = attrs + return desc def _parse_op_to_desc(op, dist_context=None): @@ -126,66 +142,136 @@ class CommContext: _instance = None _has_instance = False - def __init__(self, cluster): - if CommContext._has_instance: - return - self.cluster = cluster - self._alpha_base_ring = 8.4 - self._alpha_base_tree = 0 - self._alpha_inter = None - self._alpha_intra - self._beta = {} - def __new__(cls, *args, **kwargs): if cls._instance is None: - cls._instance = super().__new__(cls, *args, **kwargs) + cls._instance = super().__new__(cls) _has_instance = True return cls._instance - @property - def alpha_inter(self): - if self._alpha_inter is None: - if cluster.alpha.inter == "NVL": - self._alpha_inter = 3.4 - elif cluster.alpha.inter == "PHB": - self._alpha_inter = 5.7 - return self._alpha_inter - - @property - def alpha_intra(self): - if self._alpha_intra is None: - if cluster.alpha.intra == "NVL": - self._alpha_intra = 28 - elif 
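`build_comm_desc`, added above, lets a communication cost be estimated from a synthetic descriptor instead of a concrete operator: the (dtype, shape) pair stored under inputs["X"] is exactly what CommOpCost.comm_count reads later. A small usage sketch (the dtype is shown as a string for brevity; the real callers pass a Paddle dtype such as paddle.float32):

```python
def build_comm_desc(op_type, group_ranks, dtype, shape, attrs=None):
    # Same structure as the helper added to base_cost.py.
    desc = {"op": op_type,
            "group_ranks": group_ranks,
            "inputs": {"X": [(dtype, shape)]}}
    if attrs is not None:
        desc["attrs"] = attrs
    return desc

desc = build_comm_desc("c_allreduce_sum", [0, 1, 2, 3], "float32", [1024, 1024])
print(desc["op"], desc["inputs"]["X"][0][1])  # c_allreduce_sum [1024, 1024]
```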
cluster.alpha.intra == "PHB": - self._alpha_intra = 28 - return self._alpha_intra - - @property - def alpha_base_ring(self): - return self._alpha_base_ring - - @property - def alpha_base_tree(self): - return self._alpha_base_tree - - def get_beta(self, ranks): + def __init__(self, cluster): + if CommContext._has_instance: + return + self.beta = {} + self.hops = {} + self.cluster = cluster + # if cluster has no info about those vars, it will be set by default + self.base_ring = None + self.base_tree = None + # self.base_inter_ring = None + # self.base_inter_tree = None + self.intra_ring = None + self.intra_tree = None + self.inter_ring = None + self.inter_tree = None + self.switch = None + self._post_init() + + def _post_init(self): + alpha_latency = self.cluster.alpha_latency + if alpha_latency is None: + # set default + self.base_ring = 8.4 + self.base_tree = 0. + # NVL in default + self.intra_ring = 3.4 + self.intra_tree = 28 + # NET in default + self.inter_ring = 9.6 + self.inter_tree = 28 + self.switch = 10.0 + else: + base_ring = alpha_latency.base_ring + self.base_ring = base_ring if base_ring is not None else 8.4 + + base_tree = alpha_latency.base_tree + self.base_tree = base_tree if base_tree is not None else 0. + + intra_ring = alpha_latency.intra_ring + if intra_ring == LinkType.NVL: + self.intra_ring = 3.4 + elif intra_ring == LinkType.PHB: + self.intra_ring = 5.7 + elif intra_ring is not None: + self.intra_ring = intra_ring + else: + # NVL Default + self.intra_ring = 3.4 + + intra_tree = alpha_latency.intra_tree + if intra_tree == LinkType.NVL: + self.intra_tree = 28 + elif intra_tree == LinkType.PHB: + self.intra_tree = 28 + elif intra_tree is not None: + self.intra_tree = intra_tree + else: + # NVL Default + self.intra_tree = 28 + + inter_ring = alpha_latency.inter_ring + if inter_ring == LinkType.NET: + self.inter_ring = 9.6 + elif inter_ring is not None: + self.inter_ring = inter_ring + else: + # NET Default + self.inter_ring = 9.6 + + inter_tree = alpha_latency.inter_tree + if inter_tree == LinkType.NET: + self.inter_tree = 28 + elif inter_tree is not None: + self.inter_tree = inter_tree + else: + # NET Default + self.inter_tree = 28 + + switch = alpha_latency.switch + self.switch = switch if switch is not None else 10 + + assert self.base_ring is not None + assert self.base_tree is not None + assert self.intra_ring is not None + assert self.intra_tree is not None + assert self.inter_ring is not None + assert self.inter_tree is not None + assert self.switch is not None + + def get_max_beta(self, ranks): + # NOTE: Get beta by ring, even in the case of tree such as tree broadcast + ranks = self.cluster.convert_rank_to_device_id(ranks) key = ','.join(map(str, sorted(ranks))) max_beta = None - if key in self._beta.keys: - max_beta = self._beta[key] + if key in self.beta: + max_beta = self.beta[key] else: for i in range(len(ranks)): for j in range(i + 1, len(ranks)): - if min_beta == None: - min_beta = cluster.get_beta(ranks[i], ranks[j]) + forward_order_beta = self.cluster.get_beta(ranks[i], + ranks[j]) + backward_order_beta = self.cluster.get_beta(ranks[j], + ranks[i]) + beta = forward_order_beta if forward_order_beta > backward_order_beta else backward_order_beta + if max_beta == None: + max_beta = beta else: - beta = cluster.get_beta(ranks[i], ranks[j]) if beta > max_beta: max_beta = beta - self._beta[key] = max_beta + self.beta[key] = max_beta return max_beta + def get_hops(self, ranks): + key = ','.join(map(str, sorted(ranks))) + hops = 0 + for i in range(len(ranks)): + 
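`get_max_beta` above takes the worst per-byte cost over every pair of ranks in the group, checking both link directions, and caches the result by the sorted rank key. A standalone sketch of that selection with a toy beta table (the table values are made up for illustration):

```python
def get_max_beta(ranks, get_beta):
    """Worst-case per-byte transfer cost over all pairs in the group,
    checking both directions like CommContext.get_max_beta does."""
    max_beta = None
    for i in range(len(ranks)):
        for j in range(i + 1, len(ranks)):
            beta = max(get_beta(ranks[i], ranks[j]), get_beta(ranks[j], ranks[i]))
            max_beta = beta if max_beta is None else max(max_beta, beta)
    return max_beta

# Toy cluster: intra-machine links are fast, the 0<->2 link is the slow one.
beta_table = {(0, 1): 0.1, (1, 0): 0.1, (0, 2): 0.9, (2, 0): 0.7,
              (1, 2): 0.3, (2, 1): 0.3}
print(get_max_beta([0, 1, 2], lambda a, b: beta_table[(a, b)]))  # 0.9
```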
for j in range(i + 1, len(ranks)): + hop = self.cluster.get_hop(ranks[i], ranks[j]) + hops += hop + self.hops[key] = hops + + return hops + class Cost: def __init__(self, time=0, memory=0, flops=0): @@ -198,11 +284,13 @@ def _check_time(self, val): def _check_memory(self, val): assert isinstance( - val, int) and val >= 0, "Memory must be int and greater than 0." + val, + int) and val >= 0, "Memory must be int and greater than equal to 0." def _check_flops(self, val): assert isinstance( - val, int) and val >= 0, "FLOPs must be int and greater than 0." + val, + int) and val >= 0, "FLOPs must be int and greater than equal to 0." @property def time(self): @@ -254,7 +342,7 @@ def __init__(self, op=None, op_desc=None): op_desc is not None) self._op = op self._op_desc = op_desc - self._cost = self.calc_cost() + self._cost = None @property def op(self): @@ -264,6 +352,18 @@ def op(self): def op_desc(self): return self._op_desc + @property + def time(self): + return self.cost.time + + @property + def memory(self): + return self.cost.memory + + @property + def flops(self): + return self.cost.flops + @property def cost(self): return self._cost @@ -284,6 +384,40 @@ def calc_cost(self): cost = Cost(time, memory, flops) return cost + def __add__(self, rhs): + assert isinstance(rhs, (OpCost, Cost)) + time = 0 + memory = 0 + flops = 0 + if isinstance(rhs, OpCost): + time = self.cost.time + rhs.cost.time + memory = self.cost.memory + rhs.cost.memory + flops = self.cost.flops + rhs.cost.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + elif isinstance(rhs, Cost): + time = self.time + rhs.time + memory = self.memory + rhs.memory + flops = self.flops + rhs.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + return Cost(time, memory, flops) + + def __sub__(self, rhs): + assert isinstance(rhs, (OpCost, Cost)) + time = 0 + memory = 0 + flops = 0 + if isinstance(rhs, OpCost): + time = self.cost.time - rhs.cost.time + memory = self.cost.memory - rhs.cost.memory + flops = self.cost.flops - rhs.cost.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + elif isinstance(rhs, Cost): + time = self.time - rhs.time + memory = self.memory - rhs.memory + flops = self.flops - rhs.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + return Cost(time, memory, flops) + class CommOpCost(OpCost): OP_TYPE = "COMM" @@ -292,11 +426,83 @@ def __init__(self, op=None, op_desc=None, comm_context=None): super(CommOpCost, self).__init__(op=op, op_desc=op_desc) self._check_comm_op_type() self._comm_context = comm_context + self._group_ranks = None + self._comm_count = None + self._hops = None + self._rank_count = len(self.group_ranks) + self._machine_count = None + self._cost = self.calc_cost() @property def comm_context(self): return self._comm_context + @property + def comm_count(self): + if self._comm_count is None: + dtype = None + shape = None + if self.op is not None: + vars = self.op.block.vars + # NOTE: The tensor communicated input_name is "X" in default. 
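OpCost now exposes time/memory/flops directly and defines `__add__`/`__sub__`, so per-operator costs can be folded into a running total; note that the result of the arithmetic is a plain Cost rather than another OpCost. Schematically:

```python
class Cost:
    def __init__(self, time=0, memory=0, flops=0):
        self.time, self.memory, self.flops = time, memory, flops

    def __add__(self, rhs):
        # Component-wise accumulation, mirroring OpCost.__add__ in base_cost.py.
        return Cost(self.time + rhs.time,
                    self.memory + rhs.memory,
                    self.flops + rhs.flops)

total = Cost(0, 0, 0)
for c in [Cost(1.5, 4096, 2_000_000), Cost(0.8, 1024, 500_000)]:
    total = total + c
print(total.time, total.memory, total.flops)  # 2.3 5120 2500000
```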
Otherwise, this function should be overrided + var_name = self.op.input("X")[0] + var = vars[var_name] + dtype = var.dtype + shape = var.shape + elif self.op_desc is not None: + dtype = self.op_desc["inputs"]["X"][0][0] + shape = self.op_desc["inputs"]["X"][0][1] + + factor = None + if dtype == paddle.float32 or dtype == paddle.int32: + factor = 4 + elif dtype == paddle.int64: + factor = 8 + elif dtype == paddle.uint8: + factor = 1 + elif dtype == paddle.float16: + factor = 2 + else: + raise TypeError("This dtype {} is not supported now".format( + dtype)) + comm_count = reduce(lambda x, y: x * y, shape) * factor + self._comm_count = comm_count + + return self._comm_count + + @property + def rank_count(self): + return self._rank_count + + @property + def machine_count(self): + if self._machine_count is None: + cluster = self._comm_context.cluster + self._machine_count = cluster.get_involved_machine_count( + self.group_ranks) + return self._machine_count + + @property + def hops(self): + if self._hops is None: + self._hops = self.comm_context.get_hops(self.group_ranks) + return self._hops + + @property + def group_ranks(self): + if self._group_ranks is None: + if self.op_desc is not None: + self._group_ranks = self.op_desc["group_ranks"] + elif self.op is not None: + ring_id = op.attrs("ring_id") + process_group = get_process_group(ring_id) + if process_group is None: + raise ValueError( + "There not exists process group whose ring_id is {}.". + format(ring_id)) + self._group_ranks = process_group.ranks + return self._group_ranks + @classmethod def _check_comm_op_type(cls): if cls.OP_TYPE != "COMM": @@ -311,6 +517,7 @@ class CompOpCost(OpCost): def __init__(self, op=None, op_desc=None, cluster=None): super(CompOpCost, self).__init__(op=op, op_desc=op_desc) self._check_comp_op_type() + self._cost = self.calc_cost() self.cluster = cluster @classmethod @@ -325,18 +532,22 @@ def register_op_cost(cls): op_type = cls.OP_TYPE def register(op_type): - OP_COST_FACTORY[op_type] = cls + global _g_op_cost_factory + _g_op_cost_factory[op_type] = cls - return register(op_type) + register(op_type) + return cls -def calc_time_from_model(op=None, desc=None, cluster=None, comm_context=None): +def calc_time_by_modeling(op=None, desc=None, cluster=None): op_type = op.type if op is not None else desc["op"] if op_type in COMM_OP_TYPE: - op_cost = OP_COST_FACTORY[op_type](op=op, - op_desc=desc, - comm_context=comm_context) + op_cost = _g_op_cost_factory[op_type](op=op, + op_desc=desc, + comm_context=CommContext(cluster)) elif op_type not in NON_COMP_TYPE: - op_cost = OP_COST_FACTORY[op_type](op=op, op_desc=desc, cluster=cluster) + op_cost = _g_op_cost_factory[op_type](op=op, + op_desc=desc, + cluster=cluster) time = op_cost.calc_time() return time diff --git a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py index 359f6b6e7862c..a32fdf1824e62 100644 --- a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
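`comm_count` turns the communicated tensor into a byte count: a per-dtype width (4 for float32/int32, 8 for int64, 2 for float16, 1 for uint8) times the product of the shape. In plain Python:

```python
from functools import reduce

DTYPE_BYTES = {"float32": 4, "int32": 4, "int64": 8, "uint8": 1, "float16": 2}

def comm_count(dtype, shape):
    """Message size in bytes, as CommOpCost.comm_count derives it from the
    dtype and shape of the communicated input "X"."""
    return reduce(lambda x, y: x * y, shape) * DTYPE_BYTES[dtype]

print(comm_count("float32", [1024, 1024]))  # 4194304 bytes (~4 MiB)
```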
@@ -12,17 +12,149 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import register_op_cost, CommOpCost, OP_COST_FACTORY +import math + +from .base_cost import register_op_cost, CommOpCost, _g_op_cost_factory @register_op_cost -class AllreduceSumCost(CommOpCost): +class AllreduceSumOpCost(CommOpCost): OP_TYPE = "c_allreduce_sum" def __init__(self, op=None, op_desc=None, comm_context=None): - super(OP_COST_FACTORY["c_allreduce_sum"], self).__init__( + super(AllreduceSumOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + # use tree if cross machine and use ring if in a single machine + time = None + cluster = self.comm_context.cluster + if not cluster.cross_machine(self.group_ranks): + time = self.calc_time_ring() + else: + time = self.calc_time_tree() + + return time + + def calc_time_ring(self): + alpha = self.comm_context.base_ring + alpha += 2 * ( + self.rank_count - self.machine_count) * self.comm_context.intra_ring + alpha += 2 * (self.machine_count - 1) * ( + self.comm_context.inter_ring + self.hops * self.comm_context.switch) + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + 2 * (self.rank_count - 1 + ) / self.rank_count * self.comm_count * beta + + return time + + def calc_time_tree(self): + alpha = self.comm_context.base_tree + alpha += 2 * (self.rank_count / self.machine_count - 1 + ) * self.comm_context.intra_tree + alpha += math.log2(self.machine_count) * ( + self.comm_context.inter_tree + self.hops * self.comm_context.switch) + beta = self.comm_context.get_max_beta(self.group_ranks) + + time = alpha + 2 * self.comm_count * beta + + return time + + +@register_op_cost +class AllgatherOpCost(CommOpCost): + OP_TYPE = "c_allgather" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(AllgatherOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + time = self.calc_time_ring() + return time + + def calc_time_ring(self): + alpha = self.comm_context.base_ring + alpha += ( + self.rank_count - self.machine_count) * self.comm_context.intra_ring + alpha += (self.machine_count - 1) * ( + self.comm_context.inter_ring + self.hops * self.comm_context.switch) + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + (self.rank_count - 1 + ) / self.rank_count * self.comm_count * beta + return time + + +@register_op_cost +class BroadcastOpCost(CommOpCost): + OP_TYPE = "c_broadcast" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(BroadcastOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + time = self.calc_time_ring() + return time + + def calc_time_ring(self): + alpha = self.comm_context.base_ring + if self.machine_count > 1: + alpha += self.comm_context.inter_ring + self.hops * self.comm_context.switch + else: + alpha += self.comm_context.intra_ring + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + self.comm_count * beta + + return time + + +@register_op_cost +class IdentityOpCost(CommOpCost): + OP_TYPE = "c_identity" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(IdentityOpCost, self).__init__( op=op, op_desc=op_desc, comm_context=comm_context) def calc_time(self): - # NOTE: The actual formula will be filled in the future. 
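AllreduceSumOpCost below picks the ring formula inside a single machine and the tree formula once the group crosses machines. Both follow an alpha-beta model: a startup latency built from the base/intra/inter/switch constants (the defaults installed by `_post_init` when the cluster provides no alpha_latency info) plus a bandwidth term proportional to `comm_count * beta`. A runnable transcription of the two formulas, assuming the latency constants are in microseconds and comm_count in bytes (the units are not pinned down in the source):

```python
import math

# Default latency constants from CommContext._post_init (unit assumed, see above).
BASE_RING, BASE_TREE = 8.4, 0.0
INTRA_RING, INTRA_TREE = 3.4, 28.0   # NVLink defaults
INTER_RING, INTER_TREE = 9.6, 28.0   # network defaults
SWITCH = 10.0

def allreduce_ring_time(rank_count, machine_count, hops, comm_count, beta):
    alpha = BASE_RING
    alpha += 2 * (rank_count - machine_count) * INTRA_RING
    alpha += 2 * (machine_count - 1) * (INTER_RING + hops * SWITCH)
    return alpha + 2 * (rank_count - 1) / rank_count * comm_count * beta

def allreduce_tree_time(rank_count, machine_count, hops, comm_count, beta):
    alpha = BASE_TREE
    alpha += 2 * (rank_count / machine_count - 1) * INTRA_TREE
    alpha += math.log2(machine_count) * (INTER_TREE + hops * SWITCH)
    return alpha + 2 * comm_count * beta

# 16 ranks on 2 machines, ~4 MiB message: calc_time uses the tree estimate when
# the group crosses machines and the ring estimate otherwise.
print(allreduce_ring_time(16, 2, 1, 4 << 20, 1e-3))
print(allreduce_tree_time(16, 2, 1, 4 << 20, 1e-3))
```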
return 0 + + +@register_op_cost +class RecvOpCost(CommOpCost): + OP_TYPE = "recv_v2" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(RecvOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + alpha = self.comm_context.base_ring + if self.machine_count > 1: + alpha += self.comm_context.inter_ring + self.hops * self.comm_context.switch + else: + alpha += self.comm_context.intra_ring + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + self.comm_count * beta + return time + + +@register_op_cost +class SendOpCost(CommOpCost): + OP_TYPE = "send_v2" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(SendOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + alpha = self.comm_context.base_ring + if self.machine_count > 1: + alpha += self.comm_context.inter_ring + self.hops * self.comm_context.switch + else: + alpha += self.comm_context.intra_ring + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + self.comm_count * beta + + return time diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py index c4d88cb25dc1e..067ad48028d82 100644 --- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import Cost, register_op_cost, CompOpCost, OP_COST_FACTORY +from .base_cost import Cost, register_op_cost, CompOpCost @register_op_cost @@ -20,7 +20,7 @@ class MatmulV2OpCost(CompOpCost): OP_TYPE = "matmul_v2" def __init__(self, op=None, op_desc=None, cluster=None): - super(OP_COST_FACTORY["matmul_v2"], self).__init__( + super(MatmulV2OpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) # For a concrete COMP OP, the calc_time and calc_flops function needs to be overrided diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index 857f141f30b1f..6fa5b756c75c3 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -485,10 +485,10 @@ def __str__(self): self.process_mesh) for arg_name, tensor_dist_attr in self.inputs_dist_attrs.items(): - str += "\n\t\t{}'s: {},".format(arg_name, tensor_dist_attr) + str += "\n\t\t{}'s (input): {},".format(arg_name, tensor_dist_attr) for arg_name, tensor_dist_attr in self.outputs_dist_attrs.items(): - str += "\n\t\t{}'s: {},".format(arg_name, tensor_dist_attr) + str += "\n\t\t{}'s (output): {},".format(arg_name, tensor_dist_attr) str += "\n\t\timpl type: {}, ".format(self._impl_type) str += "impl idx: {}".format(self._impl_idx) diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 5082ac987f456..f9d77a0077c56 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -55,10 +55,10 @@ class DistributedContext: 
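SendOpCost, RecvOpCost, and BroadcastOpCost above all share the same point-to-point shape of the model: one startup term that switches between intra-machine and inter-machine latency, then `comm_count * beta` for the payload. A compact sketch using the same default constants:

```python
def p2p_time(machine_count, hops, comm_count, beta,
             base_ring=8.4, intra_ring=3.4, inter_ring=9.6, switch=10.0):
    """Latency model shared by SendOpCost, RecvOpCost and BroadcastOpCost:
    a single startup term plus comm_count * beta for the message body."""
    alpha = base_ring
    if machine_count > 1:
        alpha += inter_ring + hops * switch
    else:
        alpha += intra_ring
    return alpha + comm_count * beta

print(p2p_time(machine_count=2, hops=1, comm_count=4 << 20, beta=1e-3))
```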
def __init__(self, serial_main_prog=None, serial_startup_prog=None, - dist_main_progs=None, - dist_startup_progs=None, - serial_loss=None, serial_optimizer=None, + serial_loss=None, + feed_vars=None, + fetch_vars=None, strategy=None): # Data members related to original programs (unchanged) self._original_serial_main_program = serial_main_prog @@ -75,8 +75,10 @@ def __init__(self, # Data members related to programs (changed) self._serial_main_program = None self._serial_startup_program = None - self._serial_loss = None - self._serial_optimizer = None + self._serial_loss = serial_loss + self._serial_optimizer = serial_optimizer + self._serial_feed_vars = feed_vars + self._serial_fetch_vars = fetch_vars # Data members related to the program self._dist_tensors_for_program = {} @@ -92,12 +94,8 @@ def __init__(self, # Data members related to the distributed programs # Distributed programs - self._dist_main_programs = dist_main_progs - if not self._dist_main_programs: - self._dist_main_programs = {} - self._dist_startup_programs = dist_startup_progs - if not self._dist_startup_programs: - self._dist_startup_programs = {} + self._dist_main_programs = {} + self._dist_startup_programs = {} # Distributed Strategy self._strategy = strategy @@ -132,34 +130,26 @@ def serial_main_program(self, program): def serial_startup_program(self): return self._serial_startup_program - # @serial_startup_program.setter - # def serial_startup_program(self, serial_startup_program): - # self._serial_startup_program = serial_startup_program - @property def serial_loss(self): return self._serial_loss - # @serial_loss.setter - # def serial_loss(self, serial_loss): - # self._serial_loss = serial_loss - @property def serial_optimizer(self): return self._serial_optimizer - # @serial_optimizer.setter - # def serial_optimizer(self, serial_optimizer): - # self._serial_optimizer = serial_optimizer + @property + def serial_feed_vars(self): + return self._serial_feed_vars + + @property + def serial_fetch_vars(self): + return self._serial_fetch_vars @property def strategy(self): return self._strategy - # @strategy.setter - # def strategy(self, strategy): - # self._strategy = strategy - @property def serial_graph(self): return self._serial_graph @@ -678,7 +668,7 @@ def validate_dist_attr_for_program(self): dist_op.serial_op.type) if (dist_op is not None) and (not dist_op.validate_dist_attr()): assert False, "Operator {} has a wrong distributed attributes {}.".format( - dist_op.serial_op.type, dist_tensor.dist_attr) + dist_op.serial_op.type, dist_op.dist_attr) return True def __deepcopy__(self, memo): diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index ea6aeb513ffb9..b9ee6d93fd209 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -34,12 +34,9 @@ from paddle.distributed.utils import get_logger from paddle.distributed.passes import new_pass, PassContext -from .mapper import mapping from .cluster import Cluster -from .reshard import Resharder -from .planner import Planner -from .completion import Completer -from .partitioner import Partitioner +from .planner_v2 import Planner +from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator from .dist_saver import DistributedSaver from .dist_loader import NonIterableGeneratorLoader @@ -79,7 +76,6 @@ def __init__(self, self._dist_main_progs = defaultdict(dict) # dist main programs self._dist_startup_progs = defaultdict(dict) # dist 
startup programs self._dist_contexts = {} - self._pass_contexts = {} self._feed_vars = {} self._fetch_vars = {} @@ -94,10 +90,27 @@ def prepare(self, self._loss = loss self._metrics = to_list(metrics) self._mode = mode - self._build(mode) # build forward program - self._plan(mode) # completion & planner - self._parallel(mode, all_ranks) # parallel - self._initialize(mode) # init comm and startup program + # Build forward program + self._build(mode) + # Do the planning process + planner = Planner(mode, self._dist_contexts[mode]) + planner.plan() + # Parallelize program based on the planner's results + # For now, the completer has to be passed to the planner, + # because we may use it to complete the annotation of the backwarkward and update. + parallelizer = Parallelizer(mode, planner.completer, + self._dist_contexts[mode]) + if not all_ranks: + parallelizer.parallel(self._cur_rank) + else: + parallelizer.parallel_all() + # Get the distributed main programs and startup programs + self._dist_main_progs[mode] = self._dist_contexts[ + mode].dist_main_programs + self._dist_startup_progs[mode] = self._dist_contexts[ + mode].dist_startup_programs + # Init comm and startup program + self._initialize(mode) def _build(self, mode): serial_main_prog = self._serial_main_progs.get(mode, None) @@ -133,34 +146,9 @@ def _build(self, mode): self._serial_main_progs[mode] = serial_main_prog self._serial_startup_progs[mode] = serial_startup_prog self._dist_contexts[mode] = DistributedContext( - serial_main_prog, serial_startup_prog, self._dist_main_progs[mode], - self._dist_startup_progs[mode]) - self._pass_contexts[mode] = PassContext() - - def _plan(self, mode): - - # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it need - # dependency of backward-forward ops in forward completition. 
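The reworked `prepare` above delegates to the v2 planning/parallelization pipeline: build the forward program, let the Planner complete the forward annotation, hand its completer to the Parallelizer, then pull the per-rank programs out of the DistributedContext. A schematic of that call sequence, simplified from the engine code above (it assumes this development branch is installed and that `dist_context`, `cur_rank`, and `all_ranks` are supplied by the caller):

```python
from paddle.distributed.auto_parallel.planner_v2 import Planner
from paddle.distributed.auto_parallel.parallelizer_v2 import Parallelizer

def prepare(mode, dist_context, cur_rank, all_ranks=False):
    planner = Planner(mode, dist_context)
    planner.plan()                                    # forward completion
    parallelizer = Parallelizer(mode, planner.completer, dist_context)
    if not all_ranks:
        parallelizer.parallel(cur_rank)               # partition/reshard for one rank
    else:
        parallelizer.parallel_all()
    # The Parallelizer stores the per-rank programs on the context.
    return (dist_context.dist_main_programs,
            dist_context.dist_startup_programs)
```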
- defualt_ctx = get_default_distributed_context() - self._dist_contexts[mode]._dist_op_context = defualt_ctx.dist_op_context - - # Complete the distributed annotation - serial_main_prog = self._serial_main_progs[mode] - self._completer = Completer(self._dist_contexts[mode]) - self._completer.complete_forward_annotation(serial_main_prog) - # TODO: add auto planner process - # parse forward sub block - self._dist_contexts[mode].block_state.parse_forward_blocks( - serial_main_prog) - - def _parallel(self, mode, all_ranks=False): - if not all_ranks: - self._parallel_program(mode, self._cur_rank) - else: - world_process_group = get_world_process_group() - all_ranks = world_process_group.ranks - for rank in all_ranks: - self._parallel_program(mode, rank) + self._serial_main_progs[mode], self._serial_startup_progs[mode], + self._optimizer, losses, self._feed_vars[mode], + self._fetch_vars[mode], self.strategy) def _initialize(self, mode): if self._nranks > 1: @@ -189,131 +177,6 @@ def _initialize(self, mode): prune_startup_prog = dist_startup_prog._prune(uninitialized) self._executor.run(prune_startup_prog) - def _parallel_program(self, mode, rank): - serial_main_program = self._serial_main_progs[mode] - serial_startup_program = self._serial_startup_progs[mode] - dist_context = self._dist_contexts[mode] - if mode == "train" and self._optimizer: - # Generate backward - serial_loss = self._fetch_vars[mode]["loss"][0] - params_grads = self._generate_backward( - serial_main_program, serial_startup_program, serial_loss) - # Apply pre optimization passes - self._apply_pre_optimization(serial_main_program, - serial_startup_program, serial_loss, - params_grads) - # Do logical partition - partitioner = Partitioner(dist_context, rank) - dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( - serial_main_program, serial_startup_program, params_grads) - # Generate optimizer - self._generate_optimizer(dist_main_prog, dist_startup_prog, - dist_params_grads) - # Do reshard process - set_grad_var_shape(dist_main_prog, dist_context) - make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) - resharder = Resharder(dist_main_prog, dist_startup_prog, rank, - dist_context, dist_params_grads) - resharder.reshard() - # Apply post optimization passes - self._apply_post_optimization(dist_main_prog, dist_startup_prog, - rank, dist_params_grads) - else: - # Apply pre optimization passes - self._apply_pre_optimization(serial_main_program, - serial_startup_program, None, None) - # Do logical partition - partitioner = Partitioner(dist_context, rank) - dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( - serial_main_program, serial_startup_program, []) - # Do reshard process - make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) - resharder = Resharder(dist_main_prog, dist_startup_prog, rank, - dist_context, [], 1) - resharder.reshard() - - # clone program for test - if mode != 'train': - dist_main_prog = dist_main_prog.clone(for_test=True) - dist_startup_prog = dist_startup_prog.clone(for_test=True) - - self._dist_main_progs[mode][rank] = dist_main_prog - self._dist_startup_progs[mode][rank] = dist_startup_prog - - def _generate_backward(self, main_program, startup_program, loss): - with program_guard(main_program, startup_program): - params_grads = append_backward( - loss, - distop_context=self._dist_contexts[self.mode].dist_op_context) - self._completer.complete_backward_annotation(main_program) - 
self._dist_contexts[self.mode].block_state.parse_backward_blocks( - main_program) - return params_grads - - def _generate_optimizer(self, main_program, startup_program, params_grads): - with program_guard(main_program, startup_program): - optimizer_ops = copy.deepcopy(self._optimizer).apply_gradients( - params_grads) - self._completer.complete_update_annotation(main_program) - return optimizer_ops - - def _apply_pre_optimization(self, main_program, startup_program, loss, - params_grads): - - # apply amp pass - if self.strategy.amp: - config = copy.deepcopy(self.strategy.amp_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["params_grads"] = params_grads - config["loss"] = loss - config["input_data"] = self._feed_vars[self.mode][ - "inputs"] + self._feed_vars[self.mode]["labels"] - if config["use_pure_fp16"]: - config["base_opt"] = self._optimizer - auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) - auto_parallel_fp16_pass.apply([main_program], - [startup_program], - self._pass_contexts[self.mode]) - else: - auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) - auto_parallel_amp_pass.apply([main_program], [startup_program], - self._pass_contexts[self.mode]) - - # apply recompute pass - if self.strategy.recompute: - config = copy.deepcopy(self.strategy.recompute_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["no_grad_set"] = None - config["loss"] = loss - auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", - config) - auto_parallel_recompute_pass.apply([main_program], - [startup_program], - self._pass_contexts[self.mode]) - - def _apply_post_optimization(self, main_program, startup_program, rank, - params_grads): - if self.strategy.sharding: - config = copy.deepcopy(self.strategy.sharding_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["params_grads"] = params_grads - config["global_rank"] = rank - auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", - config) - auto_parallel_sharding_pass.apply([main_program], - [startup_program], - self._pass_contexts[self.mode]) - - if self.strategy.gradient_merge: - config = copy.deepcopy(self.strategy.gradient_merge_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["params_grads"] = params_grads - auto_parallel_gradient_merge_pass = new_pass( - "auto_parallel_gradient_merge_pass", config) - auto_parallel_gradient_merge_pass.apply( - [main_program], [startup_program], - self._pass_contexts[self.mode]) - def fit(self, train_data, batch_size=1, diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 0696b728d161b..563d247af3bb2 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -201,10 +201,8 @@ def update_dims_mapping(self, dist_op): changed = False op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr - # The following statement will be replaced by a more elegent way - if op_desc.type() == "shape" \ - or op_desc.type() == "slice" \ - or op_desc.type() == "while": + + if op_desc.type() == "while": return False input_names = op_desc.input_names() @@ -273,6 +271,8 @@ def update_dims_mapping(self, dist_op): )[0]) if input_tensor.is_parameter: continue + if op_desc.type() in ["shape", "slice"]: + continue serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: 
continue diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py index aac7f16b6909b..78589afc498ee 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py @@ -80,12 +80,20 @@ def is_output_compatible(self, dist_op): op_dist_attr = dist_op.dist_attr dims_mapping_list = [] output_arg_names = op_desc.output_arg_names() + max_dims_mapping_len = -1 for arg_name in output_arg_names: dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) + if max_dims_mapping_len < len(dims_mapping): + max_dims_mapping_len = len(dims_mapping) dims_mapping_list.append(dims_mapping) - if compute_compatible_dims_mapping(dims_mapping_list) is None: - return False + for idx in range(max_dims_mapping_len): + dim_mappings = [] + for dims_mapping in dims_mapping_list: + if idx < len(dims_mapping): + dim_mappings.append(dims_mapping[-(idx + 1)]) + if compute_compatible_dim_mapping(dim_mappings) is None: + return False return True def is_auto_compatible(self, dist_op): @@ -94,19 +102,26 @@ def is_auto_compatible(self, dist_op): return False op_dist_attr = dist_op.dist_attr dims_mapping_list = [] + input_arg_names = op_desc.input_arg_names() - max_dims_mapping_len = -1 + input_max_dims_mapping_len = -1 for arg_name in input_arg_names: dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if max_dims_mapping_len < len(dims_mapping): - max_dims_mapping_len = len(dims_mapping) + if input_max_dims_mapping_len < len(dims_mapping): + input_max_dims_mapping_len = len(dims_mapping) dims_mapping_list.append(dims_mapping) + output_arg_names = op_desc.output_arg_names() + output_max_dims_mapping_len = -1 for arg_name in output_arg_names: dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - assert len(dims_mapping) == max_dims_mapping_len + if output_max_dims_mapping_len < len(dims_mapping): + output_max_dims_mapping_len = len(dims_mapping) dims_mapping_list.append(dims_mapping) + assert input_max_dims_mapping_len == output_max_dims_mapping_len + max_dims_mapping_len = input_max_dims_mapping_len + for idx in range(max_dims_mapping_len): dim_mappings = [] for dims_mapping in dims_mapping_list: @@ -121,35 +136,58 @@ def update_dims_mapping(self, dist_op): changed = False op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr + dims_mapping_list = [] + input_arg_names = op_desc.input_arg_names() input_dims_mapping_dict = {} input_dims_mapping_lens = {} - max_dims_mapping_len = -1 + input_max_dims_mapping_len = -1 for arg_name in input_arg_names: dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if max_dims_mapping_len < len(dims_mapping): - max_dims_mapping_len = len(dims_mapping) + if input_max_dims_mapping_len < len(dims_mapping): + input_max_dims_mapping_len = len(dims_mapping) input_dims_mapping_dict[arg_name] = dims_mapping input_dims_mapping_lens[arg_name] = len(dims_mapping) - - dims_mapping_list = [] for arg_name in input_arg_names: - if input_dims_mapping_lens[arg_name] < max_dims_mapping_len: - new_dims_mapping = [-1 for _ in range(max_dims_mapping_len)] + if input_dims_mapping_lens[arg_name] < input_max_dims_mapping_len: + new_dims_mapping = [ + -1 for _ in range(input_max_dims_mapping_len) + ] for i in range(input_dims_mapping_lens[arg_name]): - new_idx = (max_dims_mapping_len - + new_idx = (input_max_dims_mapping_len - input_dims_mapping_lens[arg_name]) + i new_dims_mapping[new_idx] = 
input_dims_mapping_dict[ arg_name][i] dims_mapping_list.append(new_dims_mapping) else: dims_mapping_list.append(input_dims_mapping_dict[arg_name]) + output_arg_names = op_desc.output_arg_names() + output_dims_mapping_dict = {} + output_dims_mapping_lens = {} + output_max_dims_mapping_len = -1 for arg_name in output_arg_names: dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - assert len(dims_mapping) == max_dims_mapping_len - dims_mapping_list.append(dims_mapping) + if output_max_dims_mapping_len < len(dims_mapping): + output_max_dims_mapping_len = len(dims_mapping) + output_dims_mapping_dict[arg_name] = dims_mapping + output_dims_mapping_lens[arg_name] = len(dims_mapping) + for arg_name in output_arg_names: + if output_dims_mapping_lens[arg_name] < output_max_dims_mapping_len: + new_dims_mapping = [ + -1 for _ in range(output_max_dims_mapping_len) + ] + for i in range(output_dims_mapping_lens[arg_name]): + new_idx = (output_max_dims_mapping_len - + output_dims_mapping_lens[arg_name]) + i + new_dims_mapping[new_idx] = output_dims_mapping_dict[ + arg_name][i] + dims_mapping_list.append(new_dims_mapping) + else: + dims_mapping_list.append(output_dims_mapping_dict[arg_name]) + assert input_max_dims_mapping_len == output_max_dims_mapping_len + max_dims_mapping_len = input_max_dims_mapping_len compatible_dims_mapping = compute_compatible_dims_mapping( dims_mapping_list) if compatible_dims_mapping is None: @@ -175,11 +213,24 @@ def update_dims_mapping(self, dist_op): changed = True for arg_name in output_arg_names: - dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if compatible_dims_mapping != dims_mapping: - op_dist_attr.set_output_dims_mapping(arg_name, - compatible_dims_mapping) - changed = True + if output_dims_mapping_lens[arg_name] < max_dims_mapping_len: + new_dims_mapping = [ + -1 for _ in range(output_dims_mapping_lens[arg_name]) + ] + for i in range(output_dims_mapping_lens[arg_name]): + new_idx = (max_dims_mapping_len - + output_dims_mapping_lens[arg_name]) + i + new_dims_mapping[i] = compatible_dims_mapping[new_idx] + if new_dims_mapping != output_dims_mapping_dict[arg_name]: + op_dist_attr.set_output_dims_mapping(arg_name, + new_dims_mapping) + changed = True + else: + if compatible_dims_mapping != output_dims_mapping_dict[ + arg_name]: + op_dist_attr.set_output_dims_mapping( + arg_name, compatible_dims_mapping) + changed = True return changed diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py new file mode 100644 index 0000000000000..401b423638cde --- /dev/null +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -0,0 +1,172 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
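The dist_eltwise changes above align inputs and outputs of different rank the way tensor broadcasting does: shorter dims_mappings are right-aligned and padded with -1 before a compatible mapping is computed per dimension. A pure-Python sketch of that alignment, assuming (as the helper names suggest, not confirmed in this hunk) that -1 means replicated/unsharded and that a compatible dimension requires all sharded entries to agree:

```python
def right_align(dims_mappings):
    """Pad shorter mappings with -1 on the left so all have equal length."""
    max_len = max(len(m) for m in dims_mappings)
    return [[-1] * (max_len - len(m)) + list(m) for m in dims_mappings]

def compatible_dim(entries):
    """Assumed semantics of compute_compatible_dim_mapping: -1 is a wildcard,
    all non -1 entries must agree, otherwise there is no compatible mapping."""
    sharded = {e for e in entries if e != -1}
    if len(sharded) > 1:
        return None
    return sharded.pop() if sharded else -1

# x: [batch, hidden] sharded along batch on mesh dim 0; bias: [hidden] replicated.
aligned = right_align([[0, -1], [-1]])
print(aligned)                                         # [[0, -1], [-1, -1]]
print([compatible_dim(col) for col in zip(*aligned)])  # [0, -1]
```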
+ +import copy +from collections import defaultdict + +from paddle.fluid import program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.passes import new_pass + +from .reshard import Resharder +from .partitioner import Partitioner +from .dist_op import DistributedOperator +from .dist_saver import DistributedSaver +from .dist_loader import NonIterableGeneratorLoader +from .utils import make_data_unshard, set_grad_var_shape +from .utils import print_program_with_dist_attr, to_list +from .process_group import get_all_process_groups, get_world_process_group +from .dist_context import DistributedContext, get_default_distributed_context + + +class Parallelizer: + def __init__(self, mode, completer, dist_context): + self._mode = mode + self._completer = completer + self._dist_context = dist_context + self._dist_context.initialize() + self._pass_context = self._dist_context.pass_context + self._strategy = self._dist_context.strategy + + def parallel_all(self): + world_process_group = get_world_process_group() + all_ranks = world_process_group.ranks + for rank in all_ranks: + self.parallel(rank) + + def parallel(self, rank): + serial_main_program = self._dist_context.serial_main_program + serial_startup_program = self._dist_context.serial_startup_program + serial_optimizer = self._dist_context.serial_optimizer + if self._mode == "train" and serial_optimizer: + # Generate backward + serial_loss = self._dist_context.serial_fetch_vars["loss"][0] + params_grads = self._generate_backward( + serial_main_program, serial_startup_program, serial_loss) + # Apply pre optimization passes + self._apply_pre_optimization(serial_main_program, + serial_startup_program, serial_loss, + serial_optimizer, params_grads) + # Do logical partition + partitioner = Partitioner(self._dist_context, rank) + dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( + serial_main_program, serial_startup_program, params_grads) + # Generate optimizer + self._generate_optimizer(dist_main_prog, dist_startup_prog, + serial_optimizer, dist_params_grads) + # Do reshard process + set_grad_var_shape(dist_main_prog, self._dist_context) + make_data_unshard(dist_main_prog, dist_startup_prog, + self._dist_context) + resharder = Resharder(dist_main_prog, dist_startup_prog, rank, + self._dist_context, dist_params_grads) + resharder.reshard() + # Apply post optimization passes + self._apply_post_optimization(dist_main_prog, dist_startup_prog, + rank, dist_params_grads) + else: + # Apply pre optimization passes + self._apply_pre_optimization( + serial_main_program, serial_startup_program, None, None, None) + # Do logical partition + partitioner = Partitioner(self._dist_context, rank) + dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( + serial_main_program, serial_startup_program, []) + # Do reshard process + make_data_unshard(dist_main_prog, dist_startup_prog, + self._dist_context) + resharder = Resharder(dist_main_prog, dist_startup_prog, rank, + self._dist_context, [], 1) + resharder.reshard() + + # Clone program for test + if self._mode != 'train': + dist_main_prog = dist_main_prog.clone(for_test=True) + dist_startup_prog = dist_startup_prog.clone(for_test=True) + + # Store the distributed programs for further usages + self._dist_context.dist_main_programs[rank] = dist_main_prog + self._dist_context.dist_startup_programs[rank] = dist_startup_prog + + def _generate_backward(self, main_program, startup_program, loss): + with program_guard(main_program, 
startup_program): + params_grads = append_backward( + loss, distop_context=self._dist_context.dist_op_context) + self._completer.complete_backward_annotation(main_program) + self._dist_context.block_state.parse_backward_blocks(main_program) + return params_grads + + def _generate_optimizer(self, main_program, startup_program, optimizer, + params_grads): + with program_guard(main_program, startup_program): + optimizer_ops = copy.deepcopy(optimizer).apply_gradients( + params_grads) + self._completer.complete_update_annotation(main_program) + return optimizer_ops + + def _apply_pre_optimization(self, main_program, startup_program, loss, + optimizer, params_grads): + if self._strategy is None: + return + # apply amp pass + if self._strategy.amp: + config = copy.deepcopy(self._strategy.amp_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + config["loss"] = loss + config["input_data"] = self._dist_context.serial_feed_vars["inputs"] \ + + self._dist_context.serial_feed_vars["labels"] + if config["use_pure_fp16"]: + config["base_opt"] = optimizer + auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) + auto_parallel_fp16_pass.apply( + [main_program], [startup_program], self._pass_context) + else: + auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) + auto_parallel_amp_pass.apply([main_program], [startup_program], + self._pass_context) + + # apply recompute pass + if self._strategy.recompute: + config = copy.deepcopy(self._strategy.recompute_configs) + config["dist_context"] = self._dist_context + config["no_grad_set"] = None + config["loss"] = loss + auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", + config) + auto_parallel_recompute_pass.apply( + [main_program], [startup_program], self._dist_context) + + def _apply_post_optimization(self, main_program, startup_program, rank, + params_grads): + if self._strategy is None: + return + if self._strategy.sharding: + config = copy.deepcopy(self._strategy.sharding_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + config["global_rank"] = rank + auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", + config) + auto_parallel_sharding_pass.apply( + [main_program], [startup_program], self._dist_context) + + if self._strategy.gradient_merge: + config = copy.deepcopy(self._strategy.gradient_merge_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + auto_parallel_gradient_merge_pass = new_pass( + "auto_parallel_gradient_merge_pass", config) + auto_parallel_gradient_merge_pass.apply( + [main_program], [startup_program], self._dist_context) diff --git a/python/paddle/distributed/auto_parallel/planner.py b/python/paddle/distributed/auto_parallel/planner.py index 73df0da10339e..b97c09bd59da8 100755 --- a/python/paddle/distributed/auto_parallel/planner.py +++ b/python/paddle/distributed/auto_parallel/planner.py @@ -35,7 +35,6 @@ from .dist_context import DistributedContext, DistributedOperatorContext from .dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute -paddle.enable_static() paddle.seed(123) random.seed(123) np.random.seed(123) diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py new file mode 100755 index 0000000000000..7db17e98d07ee --- /dev/null +++ b/python/paddle/distributed/auto_parallel/planner_v2.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .completion import Completer +from .dist_context import get_default_distributed_context +from .utils import print_program_with_dist_attr + + +class Planner: + def __init__(self, mode, dist_context): + self._mode = mode + self._dist_context = dist_context + + # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it need + # dependency of backward-forward ops in forward completion. + default_ctx = get_default_distributed_context() + self._dist_context._dist_op_context = default_ctx.dist_op_context + self._dist_context.initialize() + + self._completer = Completer(self._dist_context) + + @property + def completer(self): + return self._completer + + def plan(self): + self._completer.complete_forward_annotation() + # parse forward sub block + self._dist_context.block_state.parse_forward_blocks( + self._dist_context.serial_main_program) + # TODO: add the auto searcher diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 471448b031dde..d1b6e57ddc123 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -156,6 +156,6 @@ def __str__(self): # Note that Process group 0 is reserved for representing all ranks. -# At the begining, group 0 is empty and new ranks will be added automatically. +# At the beginning, group 0 is empty and new ranks will be added automatically. _g_process_group_map = {} _g_process_group_map[0] = ProcessGroup(0, []) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index e33a3dba669ab..a781f314d3f20 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -226,9 +226,15 @@ def _new_process_group_impl(backend, world_size, group_name, pg_options, - group_id=0): + group_id=0, + src_rank=None, + dst_rank=None): pg = None genv = _get_global_env() + if backend != 'heter': + assert src_rank is None and dst_rank is None, ( + "src_rank and dst_rank " + "can only be set for heter backend.") assert backend in _valid_backend_list, "Unsupported backend: %s." % backend if backend == "gloo": place = core.CPUPlace() @@ -269,7 +275,9 @@ def _new_process_group_impl(backend, gloo_rank=cluster_id, gloo_size=len(cluster_size), with_switch=True, - switch_endpoint=switch_ep) + switch_endpoint=switch_ep, + src_rank=src_rank, + dst_rank=dst_rank) return pg @@ -322,6 +330,16 @@ def barrier(group=None): attrs={'ring_id': ring_id}) +# _custom_gid provides a way for users to +# set the group id, which is usually useful +# to be compatible with the static mode. 
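One caveat about the _custom_gid hook introduced here: as written, _set_custom_gid assigns to a local name, so the module-level _custom_gid stays None and new_group never sees the custom id. A minimal sketch of a setter that does update it, assuming that is the intent:

    _custom_gid = None

    def _set_custom_gid(gid):
        # Rebind the module-level variable, not a function-local one.
        global _custom_gid
        _custom_gid = gid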
+_custom_gid = None + + +def _set_custom_gid(gid): + _custom_gid = gid + + def new_group(ranks=None, backend=None): """ @@ -348,9 +366,9 @@ def new_group(ranks=None, backend=None): global _group_map if in_dygraph_mode(): global _default_group_name - gid = _new_ring_id() + gid = _custom_gid if _custom_gid else _new_ring_id() group_name = _default_group_name + str(gid) - if ranks is None or len(ranks) > 1: + if backend != 'heter' and (ranks is None or len(ranks) > 1): global_group = _get_default_group() global_rank = global_group.rank global_ranks = global_group.ranks @@ -362,8 +380,10 @@ def new_group(ranks=None, backend=None): "equal to that of the default global group.") size = len(ranks) ranks = sorted(ranks) - if size > 1 and global_rank in ranks: - rank = ranks.index(global_rank) + if backend == 'heter' or (size > 1 and global_rank in ranks): + rank = 0 if backend == 'heter' else ranks.index(global_rank) + src_rank = ranks[0] if backend == 'heter' else None + dst_rank = ranks[1] if backend == 'heter' else None pg = _new_process_group_impl( backend, _default_store, @@ -371,7 +391,9 @@ def new_group(ranks=None, backend=None): size, group_name, pg_options=None, - group_id=gid) + group_id=gid, + src_rank=src_rank, + dst_rank=dst_rank) else: rank = -1 pg = None diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 9d20e432d8961..986d8e401e872 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -1168,9 +1168,9 @@ def sharding_configs(self): dp_degree(int, optional): specific the number of data parallelism group; when dp_degree >= 2, it will introduce dp_degree ways data parallelism as the outer parallelsim for the inner parallelsim. User is responsible to ensure global_world_size = mp_degree * sharding_degree * pp_degree * dp_degree. Default is 1. - mp_degree(int, optional): [Hybrid parallelism ONLY] specific the the number of gpus within each megatron parallelism group; and megatron parallelism will turn be off if mp_degree=1. Default is 1. + mp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each megatron parallelism group; and megatron parallelism will turn be off if mp_degree=1. Default is 1. - pp_degree(int, optional): [Hybrid parallelism ONLY] specific the the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1. + pp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1. pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. This configuration will affect the communication speed of Hybrid parallelism training depeneded on network topology. this strategy is experimental by now.. Default is False. @@ -1485,7 +1485,7 @@ def localsgd_configs(self): **Notes**: k_steps(int) The local steps for training before parameter synchronization. Default 1. - begin_step(int) The step of begining training by localsgd. Default 1. + begin_step(int) The step of beginning training by localsgd. Default 1. Examples: @@ -1544,7 +1544,7 @@ def adaptive_localsgd_configs(self): init_k_steps(int) The initial steps for training before adaptive localsgd. 
Then, the adaptive localsgd method will modify init_k_steps automatically. Default 1. - begin_step(int) The step of begining training by adaptive localsgd. Default 1. + begin_step(int) The step of beginning training by adaptive localsgd. Default 1. Examples: diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index c5a9df50589cc..343cca7f4f0d3 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -556,7 +556,7 @@ def launch(): - ``--selected_mlus``: mlus aliases, recommend to use ``--mlus``. - - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py`` - ``training_script_args``: The args of training_script. e.g., ``--lr=0.1`` diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py index 8f1a4de86de0d..3a52041dc7e2c 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py @@ -1372,7 +1372,7 @@ def _apply(self): max_v = self.op.attr("max") seed = self.op.attr("seed") dtype = self.op.attr("dtype") - assert max_v > min_v, "assert max_v > min_v, but recieved " + \ + assert max_v > min_v, "assert max_v > min_v, but received " + \ "as max_v={}, min_v={} ".format(max_v, min_v) tensor1 = self._create_ge_tensor([len(shape)], 2, shape) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index b7edf5830025d..d487f35324df9 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -170,7 +170,7 @@ def minimize(self, result = self._inner_optimizer.minimize(loss, startup_program, parameters, no_grad_set) - # sync parameters accross sharding ranks + # sync parameters across sharding ranks self._sharding_sync_parameters() return result @@ -181,7 +181,7 @@ def step(self): # actually updating self._inner_optimizer.step() - # sync parameters accross sharding ranks + # sync parameters across sharding ranks self._sharding_sync_parameters() # TODO is it a good way to make _grad_clip a property diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index c4d42f90615fc..90440ff9d0ea9 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -138,9 +138,16 @@ def _get_hybrid_degree(self): if pp_degree > 1: assert strategy.pipeline is True - assert global_world_size == mp_degree * sharding_degree * pp_degree * dp_degree, \ - "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( - global_world_size, mp_degree, sharding_degree, pp_degree, dp_degree) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + assert pp_degree == 2, ("For manually set pipeline, only 
" + "pp_degree = 2 is supported.") + assert global_world_size == mp_degree * sharding_degree * dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], dp_degree [{}].".format( + global_world_size, mp_degree, sharding_degree, dp_degree) + else: + assert global_world_size == mp_degree * sharding_degree * pp_degree * dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( + global_world_size, mp_degree, sharding_degree, pp_degree, dp_degree) # FIXME (JZ-LIANG) deprecated hybrid_dp if sharding_configs["hybrid_dp"]: @@ -268,7 +275,11 @@ def _inner_opt_minimize(self, loss, startup_program, parameter_list, if self.pp_degree > 1: startup_program = startup_program._pipeline_opt['startup_program'] print("pp_rank:", self.pp_rank) - main_program = program_list[self.pp_rank] + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + main_program = program_list[int( + os.getenv("PADDLE_MANUAL_PIPELINE_STAGE"))] + else: + main_program = program_list[self.pp_rank] with open("main_%d" % self.role_maker._worker_index(), 'w') as f: f.writelines(str(main_program)) main_block = main_program.global_block() @@ -633,14 +644,15 @@ def _init_pair_comm(self, pair, ring_id): self.pp_group_endpoints[pair[1]], ] pp_rank = 0 if self.pp_rank == pair[0] else 1 - self._collective_helper._init_communicator( - self._startup_program, - self.current_endpoint, - pp_group_endpoints, - pp_rank, - ring_id, - False, - sync=False) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None) is None: + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + pp_group_endpoints, + pp_rank, + ring_id, + False, + sync=False) def _init_npu_pipeline_comm(self, startup_block): # NOTE(wangxi): some bug with hccl, must set pp_degree be even number @@ -714,14 +726,15 @@ def _init_npu_pipeline_comm(self, startup_block): def _init_pipeline_comm(self, startup_block): # TODO (JZ-LIANG) to unify pp_rank_ and pp_rank - self._collective_helper._init_communicator( - self._startup_program, - self.current_endpoint, - self.pp_group_endpoints, - self.pp_rank, - self.pp_ring_id, - False, - sync=False) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None) is None: + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + self.pp_group_endpoints, + self.pp_rank, + self.pp_ring_id, + False, + sync=False) if core.is_compiled_with_npu(): self._init_npu_pipeline_comm(startup_block) @@ -1387,17 +1400,27 @@ def _build_groups(self): # NOTE (JZ-LIANG) support outter-pure-dp to scale the throughput in 3D parallelism # e.g. 
mp-sharding-pp-dp # sharding-hybrid-dp as one senario of outter-pure-dp - assert self.global_word_size == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "mp_degree: [{}], sharding_degree: [{}], pp_degree: [{}], dp_degree: [{}]; BUT global nrank: [{}]".format( - self.mp_degree, self.sharding_degree, self.pp_degree, - self.dp_degree, self.global_word_size) + local_pp_degree = self.pp_degree + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + assert self.pp_degree == 2, ("For manually set pipeline, only " + "pp_degree = 2 is supported.") + assert self.global_word_size == self.mp_degree * self.sharding_degree * self.dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], dp_degree [{}].".format( + self.global_word_size, self.mp_degree, self.sharding_degree, self.dp_degree) + local_pp_degree = 1 + else: + assert self.global_word_size == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "mp_degree: [{}], sharding_degree: [{}], pp_degree: [{}], dp_degree: [{}]; BUT global nrank: [{}]".format( + self.mp_degree, self.sharding_degree, self.pp_degree, + self.dp_degree, self.global_word_size) if self.dp_degree > 1: self.dp_ring_id = 2 - self.dp_rank = self.global_rank // (self.sharding_degree * - self.mp_degree * self.pp_degree) + self.dp_rank = self.global_rank // ( + self.sharding_degree * self.mp_degree * local_pp_degree) dp_first_rank_idx = self.global_rank % ( - self.sharding_degree * self.mp_degree * self.pp_degree) - dp_offset = (self.sharding_degree * self.mp_degree * self.pp_degree) + self.sharding_degree * self.mp_degree * local_pp_degree) + dp_offset = (self.sharding_degree * self.mp_degree * + local_pp_degree) self.dp_group_endpoints = [] for i in range(self.dp_degree): self.dp_group_endpoints.append(self.global_endpoints[ diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 59bcf50ffb798..6c8badd64e161 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -19,7 +19,7 @@ from paddle import _C_ops from paddle.autograd import PyLayer, EagerPyLayer from paddle.fluid import framework -from ...utils.recompute import check_recompute_necessary, detach_variable +from ...utils.recompute import check_recompute_necessary, detach_variable, swith_rng_state_tracker from ..parallel_layers.random import get_rng_state_tracker from paddle.fluid.framework import in_dygraph_mode @@ -151,20 +151,6 @@ def _merge_activation(tensor): return _all_gather(tensor, group=mp_group) -@contextlib.contextmanager -def _swith_rng_state_tracker(rng_state, tracker): - orig_cuda_rng_state = paddle.get_cuda_rng_state() - orig_cuda_rng_tracker = get_rng_state_tracker().get_states_tracker() - - paddle.set_cuda_rng_state(rng_state) - get_rng_state_tracker().set_states_tracker(tracker) - try: - yield - finally: - paddle.set_cuda_rng_state(orig_cuda_rng_state) - get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker) - - class _HPEagerRecomputeFunction(EagerPyLayer): """ Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: @@ -261,8 +247,8 @@ def backward(ctx, *args): tracer._has_grad = True # need restore auto_cast state as well as w/b list - with _swith_rng_state_tracker(ctx.fwd_cuda_rng_state, - ctx.fwd_cuda_rng_state_tracker): + with swith_rng_state_tracker(ctx.fwd_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): 
with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, @@ -393,8 +379,8 @@ def backward(ctx, *args): tracer._has_grad = True # need restore auto_cast state as well as w/b list - with _swith_rng_state_tracker(ctx.fwd_cuda_rng_state, - ctx.fwd_cuda_rng_state_tracker): + with swith_rng_state_tracker(ctx.fwd_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index c767be77d8384..b8d1c881a08f9 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -53,18 +53,24 @@ def check_recompute_necessary(inputs): @contextlib.contextmanager -def swith_rng_state(rng_state): +def swith_rng_state_tracker(rng_state, tracker): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker orig_cuda_rng_state = paddle.get_cuda_rng_state() + orig_cuda_rng_tracker = get_rng_state_tracker().get_states_tracker() + paddle.set_cuda_rng_state(rng_state) + get_rng_state_tracker().set_states_tracker(tracker) try: yield finally: paddle.set_cuda_rng_state(orig_cuda_rng_state) + get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker) class EagerRecomputeFunction(EagerPyLayer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker if framework._dygraph_tracer()._has_grad: check_recompute_necessary(args) @@ -98,6 +104,8 @@ def forward(ctx, run_function, preserve_rng_state, *args): "Recompute with RNG perserve is not support current device: {}.". format(cur_device)) ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( + ).get_states_tracker() # TODO support AMP tracer = framework._dygraph_tracer() @@ -126,6 +134,7 @@ def forward(ctx, run_function, preserve_rng_state, *args): @staticmethod def backward(ctx, *args): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker with paddle.fluid.dygraph.guard(): # TODO need to check the recompute calling is vaild or not @@ -143,7 +152,8 @@ def backward(ctx, *args): # NOTE support AMP # need restore auto_cast state as well as w/b list if ctx.preserve_rng_state: - with swith_rng_state(ctx.fw_cuda_rng_state): + with swith_rng_state_tracker(ctx.fw_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, @@ -199,6 +209,7 @@ def backward(ctx, *args): class RecomputeFunction(PyLayer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker if framework._dygraph_tracer()._has_grad: check_recompute_necessary(args) @@ -232,6 +243,8 @@ def forward(ctx, run_function, preserve_rng_state, *args): "Recompute with RNG perserve is not support current device: {}.". 
format(cur_device)) ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( + ).get_states_tracker() # TODO support AMP tracer = framework._dygraph_tracer() @@ -260,6 +273,7 @@ def forward(ctx, run_function, preserve_rng_state, *args): @staticmethod def backward(ctx, *args): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker with paddle.fluid.dygraph.guard(): # TODO need to check the recompute calling is vaild or not @@ -277,7 +291,8 @@ def backward(ctx, *args): # NOTE support AMP # need restore auto_cast state as well as w/b list if ctx.preserve_rng_state: - with swith_rng_state(ctx.fw_cuda_rng_state): + with swith_rng_state_tracker(ctx.fw_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 69b2237f0ba7d..f069bfbcd3501 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -21,6 +21,7 @@ from paddle.distributed.launch.job.container import Container from .master import Master +from .watcher import Watcher import time @@ -39,6 +40,8 @@ def __init__(self, ctx): self.ctx = ctx self.master = Master.factory(self.ctx) + self.watcher = Watcher(self.ctx) + self.job = Job(nnodes=self.ctx.args.nnodes, mode=self.ctx.args.run_mode, jid=self.ctx.args.job_id) @@ -114,6 +117,9 @@ def watch(self) -> bool: def stop(self, sigint=None): self.ctx.logger.debug("Controller stop") + + self.watcher.stop() + self.master.stop() self.pod.stop(sigint) diff --git a/python/paddle/distributed/launch/controllers/watcher.py b/python/paddle/distributed/launch/controllers/watcher.py new file mode 100644 index 0000000000000..4d49b924f1e81 --- /dev/null +++ b/python/paddle/distributed/launch/controllers/watcher.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
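The recompute changes above fold the parallel-layer RNG tracker into a single swith_rng_state_tracker context manager: snapshot the current CUDA RNG state and tracker states, install the ones captured at forward time, and restore the originals in a finally block so the recomputed forward replays the same random draws. The same snapshot-and-restore pattern in generic form, sketched with Python's random module so it runs without paddle or a GPU:

    import contextlib
    import random

    @contextlib.contextmanager
    def restore_rng_state(saved_state):
        original = random.getstate()      # snapshot whatever is live now
        random.setstate(saved_state)      # install the state captured earlier
        try:
            yield
        finally:
            random.setstate(original)     # always put the live state back

    saved = random.getstate()
    first_draw = random.random()
    with restore_rng_state(saved):
        replayed = random.random()        # replays the same draw as first_draw
    assert first_draw == replayed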
+ +from ..utils.nvsmi import get_gpu_process, get_gpu_util, get_gpu_info +import time +import os + +from threading import Thread + + +class Watcher(object): + def __init__(self, ctx): + self.ctx = ctx + + self.interval = 10 + + self.gpu_util = [] + + # gpu log file + self.gpus = self.ctx.args.devices or self.ctx.node.device.labels + if len(self.gpus) > 0: + fn = os.path.join(self.ctx.args.log_dir, + "{}.gpu.log".format(self.ctx.args.job_id)) + os.makedirs(os.path.dirname(fn), exist_ok=True) + self.gpu_fd = open(fn, 'w') + else: + return + + # start + self.proc = Thread(target=self.watch) + self.proc.daemon = True + self.proc.start() + + def watch(self): + if not len(self.gpus) > 0: + return + + self._print_gpu_info() + + util_key = "index,utilization_gpu,memory_total,memory_used,memory_free,timestamp" + self.gpu_fd.write(util_key) + self.gpu_fd.write('\n') + + while not self.ctx.status.is_done(): + self._save_gpu_log(util_key) + time.sleep(self.interval) + + if hasattr(self, "gpu_fd"): + self.gpu_fd.close() + + def _print_gpu_info(self): + try: + info_key = "index,uuid,driver_version,name,gpu_serial,display_active,display_mode" + self.gpu_fd.write(info_key) + self.gpu_fd.write('\n') + for line in get_gpu_info(self.gpus): + self.gpu_fd.write(line.str(info_key)) + self.gpu_fd.write('\n') + self.gpu_fd.write('\n') + + process_key = "pid,process_name,gpu_uuid,gpu_name,used_memory" + self.gpu_fd.write(process_key) + self.gpu_fd.write('\n') + for line in get_gpu_process(self.gpus): + self.gpu_fd.write(line.str(process_key)) + self.gpu_fd.write('\n') + self.gpu_fd.write('\n') + + self.gpu_fd.flush() + except: + self.ctx.log.error("save gpu info failed") + + def _save_gpu_log(self, util_key): + try: + for line in get_gpu_util(self.gpus): + self.gpu_fd.write(line.str(util_key)) + self.gpu_fd.write('\n') + self.gpu_fd.flush() + except: + self.ctx.log.error("save gpu log failed") + + def stop(self): + if hasattr(self, "proc"): + self.proc.join() diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index 400a447260252..b2c87e737c82d 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -54,7 +54,7 @@ def launch(): - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device. - - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py`` - ``training_script_args``: The args of training_script. e.g., ``--lr=0.1`` diff --git a/python/paddle/distributed/launch/utils/nvsmi.py b/python/paddle/distributed/launch/utils/nvsmi.py new file mode 100644 index 0000000000000..82a23189ac6af --- /dev/null +++ b/python/paddle/distributed/launch/utils/nvsmi.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess +import shlex +import os +import json +import shutil + + +class Info(object): + def __repr__(self): + return str(self.__dict__) + + def json(self): + return json.dumps(self.__dict__) + + def dict(self): + return self.__dict__ + + def str(self, keys=None): + if keys is None: + keys = self.__dict__.keys() + + if isinstance(keys, str): + keys = keys.split(',') + + values = [str(self.__dict__.get(k, '')) for k in keys] + return ",".join(values) + + +def query_smi(query=None, query_type="gpu", index=None, dtype=None): + """ + query_type: gpu/compute + """ + + if not has_nvidia_smi(): + return [] + + cmd = ["nvidia-smi", "--format=csv,noheader,nounits"] + if isinstance(query, list) and query_type == "gpu": + cmd.extend(["--query-gpu={}".format(",".join(query))]) + elif isinstance(query, list) and query_type.startswith("compute"): + cmd.extend(["--query-compute-apps={}".format(",".join(query))]) + else: + return + + if isinstance(index, list) and len(index) > 0: + cmd.extend(["--id={}".format(",".join(index))]) + if not isinstance(dtype, list) or len(dtype) != len(query): + dtype = [str] * len(query) + + output = subprocess.check_output(cmd, timeout=3) + lines = output.decode("utf-8").split(os.linesep) + ret = [] + for line in lines: + if not line: + continue + info = Info() + for k, v, d in zip(query, line.split(", "), dtype): + setattr(info, k.replace(".", "_"), d(v)) + ret.append(info) + return ret + + +def get_gpu_info(index=None): + q = "index,uuid,driver_version,name,gpu_serial,display_active,display_mode".split( + ",") + d = [int, str, str, str, str, str, str] + index = index if index is None or isinstance( + index, list) else str(index).split(",") + + return query_smi(q, index=index, dtype=d) + + +def get_gpu_util(index=None): + q = "index,utilization.gpu,memory.total,memory.used,memory.free,timestamp".split( + ",") + d = [int, int, int, int, int, str] + index = index if index is None or isinstance( + index, list) else str(index).split(",") + + return query_smi(q, index=index, dtype=d) + + +def get_gpu_process(index=None): + q = "pid,process_name,gpu_uuid,gpu_name,used_memory".split(",") + d = [int, str, str, str, int] + index = index if index is None or isinstance( + index, list) else str(index).split(",") + + return query_smi(q, index=index, query_type="compute", dtype=d) + + +def has_nvidia_smi(): + return shutil.which("nvidia-smi") + + +if __name__ == '__main__': + print(get_gpu_info(0)) + print(get_gpu_util(0)) + print(get_gpu_process(0)) + + u = get_gpu_util() + for i in u: + print(i.str()) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 53d35a251c8c8..8cd6c4647dce4 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -233,8 +233,13 @@ def train(): master_addr, master_port = endpoints.split(":") master_port = int(master_port) is_master = rank == 0 - default_store = core.TCPStore(master_addr, master_port, is_master, - world_size) + stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900")) + default_store = core.TCPStore( + master_addr, + master_port, 
+ is_master, + world_size, + stop_check_timeout=stop_check_timeout) _set_default_store(default_store) pg = _new_process_group_impl( backend, diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 69c3eef7e3771..9dda310e5c022 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -306,7 +306,7 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, in_var_dist_attr = consume_op_attr.get_input_dist_attr( in_var.name) assert in_var_dist_attr is not None - # truely insert cast op + # truly insert cast op if cast_var is None or cast_var.dtype != dst_dtype: # NOTE we make the cast op and var's dist attr as the op that consume the # cast var instead of the op which generates the var diff --git a/python/paddle/distributed/passes/cpp_pass.py b/python/paddle/distributed/passes/cpp_pass.py index 4a4e5ecbbb495..72525255b7eaa 100644 --- a/python/paddle/distributed/passes/cpp_pass.py +++ b/python/paddle/distributed/passes/cpp_pass.py @@ -13,6 +13,7 @@ # limitations under the License. from .pass_base import PassType, CPPPassWrapper, register_pass +from paddle.fluid.framework import core, _apply_pass as _apply_cpp_pass @register_pass("fuse_elewise_add_act") @@ -93,3 +94,35 @@ def cpp_name(self): def _type(self): return PassType.CALC_OPT + + +@register_pass("build_cinn") +class BuildCINNPass(CPPPassWrapper): + def __init__(self): + super(BuildCINNPass, self).__init__() + self.set_attr("allow_ops", []) + self.set_attr("deny_ops", []) + + @property + def cpp_name(self): + return "build_cinn_pass" + + def _type(self): + return PassType.CALC_OPT + + def _apply_single_impl(self, main_program, startup_program, context): + allow_ops = ";".join(self.get_attr("allow_ops")) + deny_ops = ";".join(self.get_attr("deny_ops")) + + assert 'FLAGS_allow_cinn_ops' in core.globals( + ), "PaddlePaddle is not compiled with CINN support" + old_allow_ops = core.globals()['FLAGS_allow_cinn_ops'] + old_deny_ops = core.globals()['FLAGS_deny_cinn_ops'] + try: + core.globals()['FLAGS_allow_cinn_ops'] = allow_ops + core.globals()['FLAGS_deny_cinn_ops'] = deny_ops + _apply_cpp_pass(main_program, startup_program, self.cpp_name, {}, + self.cpp_attr_types) + finally: + core.globals()['FLAGS_allow_cinn_ops'] = old_allow_ops + core.globals()['FLAGS_deny_cinn_ops'] = old_deny_ops diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index e7edc6fd859a6..7acfd6cfe19f5 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -748,7 +748,7 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops): def union_forward_gradient_op(program_block_ops_list): """ before analyzing the input & output of each block in program_block_list, we should - union the forward op and corresponding gradient op to elimincate the uneccessary variable + union the forward op and corresponding gradient op to elimincate the unnecessary variable transmit """ """ diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 9d9fbd39a5767..e543bc1e17b2c 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -426,6 +426,7 @@ def _optimize_fp32_graph(self, graph): graph = 
self._apply_pass(graph, 'depthwise_conv_mkldnn_pass') graph = self._apply_pass(graph, 'conv_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass') + graph = self._apply_pass(graph, 'conv_affine_channel_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_eltwiseadd_bn_fuse_pass') @@ -667,4 +668,5 @@ def _quantize_fp32_graph(self, graph): graph, 'cpu_quantize_pass', ['quant_var_scales', 'data_layout'], [self._var_quant_scales, self._get_data_layout(graph)]) graph = self._apply_pass(graph, 'cpu_quantize_squash_pass') + graph = self._apply_pass(graph, 'int8_scale_calculation_mkldnn_pass') return graph diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index 629529ff1b965..56d77f77b5083 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -405,7 +405,7 @@ def test_post_training_abs_max_mobilenetv1(self): is_full_quantize = False is_use_cache_file = False is_optimize_model = False - # The accuracy diff of post-traing quantization (abs_max) maybe bigger + # The accuracy diff of post-training quantization (abs_max) maybe bigger diff_threshold = 0.05 self.run_test(model, algo, round_type, data_urls, data_md5s, quantizable_op_type, is_full_quantize, is_use_cache_file, diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 1ae57bcb30310..ba5c709b1d877 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -257,7 +257,7 @@ def load_dygraph(model_path, **configs): para_dict = structured_para_dict else: # load state dict by `io.save_params/persistables` save format - # TODO(chenweihang): [ Now only supports loading parameters seperately ] + # TODO(chenweihang): [ Now only supports loading parameters separately ] # If users save all parameters as one file, the [ variable.name -> variable ] # mapping info will lost, so users need to give variable list, but users build # variable list in dygraph mode is difficult, we recommend users to use diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 3a7b012b02bee..576baf6cc299a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -167,7 +167,7 @@ def convert_logical_not(x): A function representation of a Python ``not`` statement. Args: - x(bool|Tensor): Operand of of ``not`` operator. + x(bool|Tensor): Operand of ``not`` operator. Returns: A python bool variable or a bool Tensor. 
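The new BuildCINNPass above wraps the C++ build_cinn_pass call in a try/finally that temporarily writes FLAGS_allow_cinn_ops / FLAGS_deny_cinn_ops into core.globals() and then restores the previous values. The same idea as a reusable context manager, sketched against a plain dict so it runs without a CINN-enabled build:

    import contextlib

    @contextlib.contextmanager
    def override_flags(flags, **overrides):
        saved = {key: flags[key] for key in overrides}   # remember old values
        flags.update(overrides)
        try:
            yield flags
        finally:
            flags.update(saved)                          # restore on any exit path

    flags = {"FLAGS_allow_cinn_ops": "", "FLAGS_deny_cinn_ops": ""}
    with override_flags(flags, FLAGS_allow_cinn_ops="relu;elementwise_add"):
        assert flags["FLAGS_allow_cinn_ops"] == "relu;elementwise_add"
    assert flags["FLAGS_allow_cinn_ops"] == ""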
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index c6ff3a583d6a3..164545d0a0595 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -873,7 +873,7 @@ def _get_targets(_optimize_ops, _fetch_list, item): _fetch_list.append(item) else: raise TypeError( - "The item in fetch_list should be str, variable or optimize_op, but recieved %s.", + "The item in fetch_list should be str, variable or optimize_op, but received %s.", type(item)) for index, item in enumerate(fetch_list): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 16a5e25472557..6957dd8c5e30c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2864,9 +2864,10 @@ def _to_readable_code(self, skip_op_callstack=True): continue # it is bytes of serialized protobuf - if self.type == 'cinn_launch' and name == 'compilation_key': - # value = core.get_readable_comile_key(self.desc) - v = self.desc.attr(name) + if is_compiled_with_cinn( + ) and self.type == 'cinn_launch' and name == 'compilation_key': + key = self.desc.attr(name) + v = core.get_serialize_comile_key(key) prog = Program() prog = prog.parse_from_string(v) s = prog._to_readable_code() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 46f26e8e52cd5..2c09abac9e7ba 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -1407,7 +1407,7 @@ def get_communicate_var_info(program, def union_forward_gradient_op(program_block_ops_list): """ before analyzing the input & output of each block in program_block_list, we should - union the forward op and corresponding gradient op to elimincate the uneccessary variable + union the forward op and corresponding gradient op to elimincate the unnecessary variable transmit """ """ diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index ce6fe6918b56b..47f0c02d28725 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -234,7 +234,7 @@ def __weight_normalize(g, v, dim): x=g, y=norm) # The shapes of g and norm are the same. # Currently, elementwise_mul only support broadcast when the shape # of y is a subset of the shape of x. Thus, we reshape y to squeeze - # to achive the subset. + # to achieve the subset. 
w = elementwise_mul( x=v, y=scale if dim is None else reshape( diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py index 43eb436f65e78..0b4211cbb63dc 100644 --- a/python/paddle/fluid/layers/collective.py +++ b/python/paddle/fluid/layers/collective.py @@ -14,7 +14,9 @@ from __future__ import print_function from ..layer_helper import LayerHelper, unique_name -from ..framework import Variable +from ..framework import Variable, in_dygraph_mode, _in_legacy_dygraph +import paddle +from paddle import _C_ops def _allreduce(x, out=None, reduce_type="sum", sync_mode=False): @@ -107,6 +109,21 @@ def _c_broadcast(x, root=0, ring_id=0, use_calc_stream=False): def _c_allgather(x, nranks, ring_id=0, use_calc_stream=False): op_type = 'c_allgather' + + if in_dygraph_mode(): + group = paddle.distributed.collective._get_default_group() + tensor_shape = list(x.shape) + tensor_shape[0] *= nranks + out = paddle.empty(tensor_shape, x.dtype) + task = group.process_group.all_gather(x, out) + task.wait() + return out + + if _in_legacy_dygraph(): + attrs = ('nranks', nranks, 'ring_id', ring_id, 'use_calc_stream', + use_calc_stream) + return _C_ops.c_allgather(x, *attrs) + helper = LayerHelper(op_type, **locals()) out_shape = list(x.shape[:]) if out_shape[0] > 0: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 200e8feec1e6a..8be719758ef98 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13744,7 +13744,7 @@ def get_tensor_from_selected_rows(x, name=None): x.height = 20 x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]] - Ouput is LoDTensor: + Output is LoDTensor: out.shape = [5, 2] out.data = [[1, 1], [2, 2], diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 707a1dc2cbc2f..b04cf90e1d8f9 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -673,7 +673,7 @@ def birnn(cell_fw, birnn creates a bidirectional recurrent neural network specified by RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) repeatedly until reaches to - the maximum length of `inputs` and then concat the ouputs for both RNNs + the maximum length of `inputs` and then concat the outputs for both RNNs along the last axis. 
Arguments: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index a9b1fa6ff0205..b02c154584e9c 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -760,8 +760,14 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): place = _current_expected_place() if force_cpu: place = core.CPUPlace() + if isinstance(shape, (list, tuple)): + for item in shape: + if not isinstance(item, Variable): + shape = list( + map(lambda x: x.numpy().flat[0] if isinstance(x, Variable) else x, + shape)) + break - shape = utils.convert_shape_to_list(shape) if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) out = _C_ops.final_state_full(shape, float(value), dtype, place) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index bb14fb9a86f15..49fb5399d8aec 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -6005,7 +6005,14 @@ def device_cmp(device1, device2): for p in program_list: self._create_vars(p.global_block(), main_block) - self.local_rank %= len(device_list) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + self.local_rank = int(os.getenv("PADDLE_MANUAL_PIPELINE_STAGE")) + assert self.local_rank < len(device_list), ( + "Manually specified " + "pipeline stage must be less than total number of pipeline " + "stages.") + else: + self.local_rank %= len(device_list) # Step3.5: optimize forward send sync_comm to overlap send and recv self._optimize_forward_send_sync(program_list[self.local_rank]) diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index c76b3da7428e3..b4adeb9575af6 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -10,6 +10,11 @@ if(WITH_GPU OR APPLE) set_tests_properties(test_custom_relu_op_jit PROPERTIES TIMEOUT 180) set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180) set_tests_properties(test_context_pool PROPERTIES TIMEOUT 180) + if($ENV{USE_STANDALONE_EXECUTOR}) + # these test will fail in some server due to PR#42149, temporarily set it use old executor. 
+ set_tests_properties(test_custom_relu_op_setup PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_custom_relu_model PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + endif() endif() py_test(test_custom_raw_op_kernel_op SRCS test_custom_raw_op_kernel_op.py) diff --git a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc index a561c845aba2b..ebfaaecd49093 100644 --- a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc @@ -23,6 +23,16 @@ std::vector PhiLinearForward(const paddle::Tensor& x, return {paddle::add(paddle::matmul(x, weight), bias)}; } +std::vector PhiLinearBackward(const paddle::Tensor& x, + const paddle::Tensor& weight, + const paddle::Tensor& bias, + const paddle::Tensor& out_grad) { + auto x_grad = paddle::matmul(out_grad, weight, false, true); + auto weight_grad = paddle::matmul(x, out_grad, true, false); + auto bias_grad = paddle::experimental::sum(out_grad, {0}); + return {x_grad, weight_grad, bias_grad}; +} + std::vector> LinearInferShape( const std::vector& x_shape, const std::vector& weight_shape, @@ -86,9 +96,14 @@ std::vector LinearInferDtype( return {x_dtype}; } -PD_BUILD_OP(pten_linear) +PD_BUILD_OP(phi_linear) .Inputs({"X", "Weight", "Bias"}) .Outputs({"Out"}) .SetKernelFn(PD_KERNEL(PhiLinearForward)) .SetInferShapeFn(PD_INFER_SHAPE(LinearInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(LinearInferDtype)); + +PD_BUILD_GRAD_OP(phi_linear) + .Inputs({"X", "Weight", "Bias", paddle::Grad("Out")}) + .Outputs({paddle::Grad("X"), paddle::Grad("Weight"), paddle::Grad("Bias")}) + .SetKernelFn(PD_KERNEL(PhiLinearBackward)); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_linear.py b/python/paddle/fluid/tests/custom_op/test_custom_linear.py index be49513da35dd..fba512d511c36 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_linear.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_linear.py @@ -40,43 +40,56 @@ verbose=True) -def linear_dynamic(func, dtype, np_x, np_weight, np_bias): - paddle.set_device("cpu") - x = paddle.to_tensor(np_x, dtype=dtype) - weight = paddle.to_tensor(np_weight, dtype=dtype) - bias = paddle.to_tensor(np_bias, dtype=dtype) +def linear_dynamic(func, device, dtype, np_x, np_weight, np_bias): + paddle.set_device(device) + x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + weight = paddle.to_tensor(np_weight, dtype=dtype, stop_gradient=False) + bias = paddle.to_tensor(np_bias, dtype=dtype, stop_gradient=False) out = func(x, weight, bias) - return out.numpy() + out.backward() + return out.numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy() -def linear_static(func, dtype, np_x, np_weight, np_bias): +def linear_static(func, device, dtype, np_x, np_weight, np_bias): paddle.enable_static() - paddle.set_device("cpu") + paddle.set_device(device) with static.scope_guard(static.Scope()): with static.program_guard(static.Program()): - x = static.data(name="x", shape=np_x.shape, dtype=dtype) + x = static.data(name="x", shape=[None, np_x.shape[1]], dtype=dtype) weight = static.data( name="weight", shape=np_weight.shape, dtype=dtype) bias = static.data(name="bias", shape=np_bias.shape, dtype=dtype) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False out = func(x, weight, bias) + mean_out = paddle.mean(out) + static.append_backward(mean_out) exe = static.Executor() exe.run(static.default_startup_program()) - out_v, = 
exe.run(static.default_main_program(), - feed={ - "x": np_x.astype(dtype), - "weight": np_weight.astype(dtype), - "bias": np_bias.astype(dtype) - }, - fetch_list=[out.name]) + out_v, x_grad_v, weight_grad_v, bias_grad_v = exe.run( + static.default_main_program(), + feed={ + "x": np_x.astype(dtype), + "weight": np_weight.astype(dtype), + "bias": np_bias.astype(dtype) + }, + fetch_list=[ + out.name, x.name + "@GRAD", weight.name + "@GRAD", + bias.name + "@GRAD" + ]) paddle.disable_static() - return out_v + return out_v, x_grad_v, weight_grad_v, bias_grad_v class TestCustomLinearJit(unittest.TestCase): def setUp(self): self.dtypes = ['float32', 'float64'] + self.devices = ['cpu'] + if paddle.is_compiled_with_cuda(): + self.devices.append('gpu') self.np_x = np.random.random((3, 2)).astype("float32") self.np_weight = np.full([2, 4], fill_value=0.5, dtype="float32") self.np_bias = np.ones([4], dtype="float32") @@ -88,20 +101,34 @@ def check_output(self, out, pd_out, name): pd_out)) def test_static(self): - for dtype in self.dtypes: - pten_out = linear_static(custom_ops.pten_linear, dtype, self.np_x, - self.np_weight, self.np_bias) - pd_out = linear_static(F.linear, dtype, self.np_x, self.np_weight, - self.np_bias) - self.check_output(pten_out, pd_out, "pten_out") + for device in self.devices: + for dtype in self.dtypes: + phi_out, phi_x_grad, phi_weight_grad, phi_bias_grad = linear_static( + custom_ops.phi_linear, device, dtype, self.np_x, + self.np_weight, self.np_bias) + pd_out, pd_x_grad, pd_weight_grad, pd_bias_grad = linear_static( + F.linear, device, dtype, self.np_x, self.np_weight, + self.np_bias) + self.check_output(phi_out, pd_out, "out") + self.check_output(phi_x_grad, pd_x_grad, "x_grad") + self.check_output(phi_weight_grad, pd_weight_grad, + "weight_grad") + self.check_output(phi_bias_grad, pd_bias_grad, "bias_grad") def func_dynamic(self): - for dtype in self.dtypes: - pten_out = linear_dynamic(custom_ops.pten_linear, dtype, self.np_x, - self.np_weight, self.np_bias) - pd_out = linear_dynamic(F.linear, dtype, self.np_x, self.np_weight, - self.np_bias) - self.check_output(pten_out, pd_out, "pten_out") + for device in self.devices: + for dtype in self.dtypes: + phi_out, phi_x_grad, phi_weight_grad, phi_bias_grad = linear_dynamic( + custom_ops.phi_linear, device, dtype, self.np_x, + self.np_weight, self.np_bias) + pd_out, pd_x_grad, pd_weight_grad, pd_bias_grad = linear_dynamic( + F.linear, device, dtype, self.np_x, self.np_weight, + self.np_bias) + self.check_output(phi_out, pd_out, "phi_out") + self.check_output(phi_x_grad, pd_x_grad, "x_grad") + self.check_output(phi_weight_grad, pd_weight_grad, + "weight_grad") + self.check_output(phi_bias_grad, pd_bias_grad, "bias_grad") def test_dynamic(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py index 5664c00d74f89..3b3a0e2edec98 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py @@ -21,8 +21,7 @@ from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args -from paddle.fluid.framework import _test_eager_guard, _enable_legacy_dygraph -_enable_legacy_dygraph() +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already 
exists in the # cache dir, it will not be compiled again unless the shared lib is removed. @@ -64,7 +63,7 @@ def setUp(self): self.dtypes = ['float32', 'float64'] self.devices = ['cpu'] - def test_func_double_grad_dynamic(self): + def func_double_grad_dynamic(self): for device in self.devices: for dtype in self.dtypes: x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) @@ -85,6 +84,11 @@ def test_func_double_grad_dynamic(self): "custom op out grad: {},\n paddle api out grad: {}".format( dout, pd_dout)) + def test_func_double_grad_dynamic(self): + with _test_eager_guard(): + self.func_double_grad_dynamic() + self.func_double_grad_dynamic() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 08e24f86a29a4..6e80e142c4b85 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1036,6 +1036,7 @@ set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIME set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120) +set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu PROPERTIES TIMEOUT 120) @@ -1148,7 +1149,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) @@ -1233,9 +1234,6 @@ if(WITH_GPU OR WITH_ROCM) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) -set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) -set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY") -set_tests_properties(test_tensordot PROPERTIES ENVIRONMENT "FLAGS_USE_STANDALONE_EXECUTOR=False") set_tests_properties(test_cuda_memory_reserved PROPERTIES ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") if (WITH_GLOO) set_tests_properties(test_parallel_dygraph_dataparallel_cpuonly PROPERTIES TIMEOUT 30) @@ -1243,3 +1241,14 @@ if (WITH_GLOO) set_tests_properties(test_parallel_dygraph_sparse_embedding_gloo PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height_gloo PROPERTIES TIMEOUT 120) endif() + +if($ENV{USE_STANDALONE_EXECUTOR}) + # these test will fail in some server due to PR#42149, temporarily set it use old executor. 
+ set_tests_properties(test_apply_pass_to_program PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_buffer_shared_memory_reuse_pass PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_imperative_optimizer PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_imperative_star_gan_with_gradient_penalty PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_switch_autotune PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_imperative_mnist_sorted_gradient PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) +endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 7c747338593a3..1f846f5d7361c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -29,4 +29,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_dist_pnorm MODULES test_dist_pnorm ENVS ${dist_ENVS}) py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS}) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) + py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py index 9a9efe7ab2dd0..3f8283866768e 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py @@ -23,6 +23,9 @@ from paddle.incubate.autograd import Hessian from paddle.distributed.auto_parallel.engine import Engine +np.random.seed(1234) +paddle.seed(1234) + class FCNet: def __init__(self, num_ins, num_outs, num_layers, hidden_size): @@ -136,10 +139,8 @@ def main(): inputs_spec=inputs_spec, labels_spec=labels_spec, strategy=dist_strategy) - paddle.seed(1234 + engine._cur_rank) engine.prepare(optimizer=optimizer, loss=loss_func) res = engine.fit(train_dataset, sample_generator=False) - assert np.allclose(res[-1], 2.840593) dist_context = engine.dist_context block = engine.main_program.global_block() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py new file mode 100644 index 0000000000000..f0ad1f4ed314d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
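The test_comm_cost case that follows builds CommContext twice, once per cluster JSON, and clears CommContext._has_instance / CommContext._instance beforehand because the class caches a single instance. A generic version of that reset-between-tests pattern, using a hypothetical cached-singleton class rather than the Paddle one:

    class CachedContext:
        _instance = None
        _has_instance = False

        def __new__(cls, cluster):
            if cls._has_instance:
                return cls._instance     # ignore the new cluster, reuse the cache
            obj = super().__new__(cls)
            obj.cluster = cluster
            cls._instance, cls._has_instance = obj, True
            return obj

        @classmethod
        def reset(cls):
            cls._instance, cls._has_instance = None, False

    CachedContext("cluster_a")
    assert CachedContext("cluster_b").cluster == "cluster_a"   # still the cached one
    CachedContext.reset()
    assert CachedContext("cluster_b").cluster == "cluster_b"   # rebuilt after reset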
+ +import unittest +import os +import json + +import paddle +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cost import CommContext +from paddle.distributed.auto_parallel.cost import build_comm_desc +from paddle.distributed.auto_parallel.cost import AllreduceSumOpCost +from paddle.distributed.auto_parallel.cost import AllgatherOpCost +from paddle.distributed.auto_parallel.cost import BroadcastOpCost +from paddle.distributed.auto_parallel.cost import SendOpCost +from paddle.distributed.auto_parallel.cost import RecvOpCost +from paddle.distributed.auto_parallel.cost import IdentityOpCost + +from test_cluster import cluster_json, multi_cluster_json + + +class TestCommOpCost(unittest.TestCase): + def test_comm_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + # Build CommConetxt + CommContext._has_instance = None + CommContext._instance = None + comm_context = CommContext(cluster) + + # Check AllreduceSumCost 128MB ring cost + allreduce_sum_op_desc = build_comm_desc( + "c_allreduce_sum", [0, 1, 2, 3, 4, 5, 6, 7], paddle.float32, + [1, 32 * (10**6)]) + allreduce_sum_op_cost = AllreduceSumOpCost( + op_desc=allreduce_sum_op_desc, comm_context=comm_context) + + # Check AllgatherOpCost cost + allgather_op_desc = build_comm_desc("c_allgather", + [0, 1, 2, 3, 4, 5, 6, 7], + paddle.float32, [1, 32 * (10**6)]) + allgather_op_cost = AllgatherOpCost( + op_desc=allgather_op_desc, comm_context=comm_context) + self.assertTrue(allgather_op_cost.time > 0) + + # Check BroadcastOpCost cost + broadcast_op_desc = build_comm_desc("c_broadcast", + [0, 1, 2, 3, 4, 5, 6, 7], + paddle.float32, [1, 32 * (10**6)]) + broadcast_op_cost = BroadcastOpCost( + op_desc=broadcast_op_desc, comm_context=comm_context) + self.assertTrue(broadcast_op_cost.time > 0) + + # Check SendOpCost cost + send_op_desc = build_comm_desc("send_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + send_op_cost = SendOpCost( + op_desc=send_op_desc, comm_context=comm_context) + self.assertTrue(send_op_cost.time > 0) + + # Check RecvOpCost cost + recv_op_desc = build_comm_desc("recv_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + recv_op_cost = RecvOpCost( + op_desc=recv_op_desc, comm_context=comm_context) + self.assertTrue(recv_op_cost.time > 0) + + # Check IdentityOpCost cost + identity_op_desc = build_comm_desc("c_identity", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + identity_op_cost = IdentityOpCost( + op_desc=identity_op_desc, comm_context=comm_context) + self.assertTrue(identity_op_cost.time >= 0) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + def test_cross_machine_comm_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(multi_cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + # Build CommConetxt + CommContext._has_instance = None + CommContext._instance = None + comm_context = CommContext(cluster) + + # Check 
AllreduceSumCost 128MB ring cost + allreduce_sum_op_desc = build_comm_desc( + "c_allreduce_sum", + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + paddle.float32, [1, 32 * (10**6)]) + allreduce_sum_op_cost = AllreduceSumOpCost( + op_desc=allreduce_sum_op_desc, comm_context=comm_context) + + # Check AllgatherOpCost cost + allgather_op_desc = build_comm_desc( + "c_allgather", + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + paddle.float32, [1, 32 * (10**6)]) + allgather_op_cost = AllgatherOpCost( + op_desc=allgather_op_desc, comm_context=comm_context) + self.assertTrue(allgather_op_cost.time > 0) + + # Check BroadcastOpCost cost + broadcast_op_desc = build_comm_desc( + "c_broadcast", + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + paddle.float32, [1, 32 * (10**6)]) + broadcast_op_cost = BroadcastOpCost( + op_desc=broadcast_op_desc, comm_context=comm_context) + self.assertTrue(broadcast_op_cost.time > 0) + + # Check SendOpCost cost + send_op_desc = build_comm_desc("send_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + send_op_cost = SendOpCost( + op_desc=send_op_desc, comm_context=comm_context) + self.assertTrue(send_op_cost.time > 0) + + # Check RecvOpCost cost + recv_op_desc = build_comm_desc("recv_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + recv_op_cost = RecvOpCost( + op_desc=recv_op_desc, comm_context=comm_context) + self.assertTrue(recv_op_cost.time > 0) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py index 0914126feb852..aa0bf719fab29 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py @@ -79,7 +79,6 @@ def parallelizer(program_func, rank): class TestDistSlice(unittest.TestCase): def test_dist_slice_dp2(self): - for rank in range(2): dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) ops = dist_main_prog.global_block().ops diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py index 0cd3041ea4d25..c0df01ada58f9 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py @@ -13,12 +13,17 @@ # limitations under the License. 
import unittest +import os +import json import paddle import paddle.distributed.auto_parallel.cost as cost_model from paddle.distributed.auto_parallel.cost.base_cost import parse_to_desc from paddle.distributed.auto_parallel.cost.base_cost import parse_desc_to_str -from paddle.distributed.auto_parallel.cost.base_cost import calc_time_from_model +from paddle.distributed.auto_parallel.cost.base_cost import calc_time_by_modeling +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cost import CommContext +from test_cluster import cluster_json, multi_cluster_json paddle.enable_static() @@ -45,26 +50,44 @@ def test_comp_cost(self): if op.type == "matmul_v2": matmul_v2_op = op break - matmul_v2_cost = cost_model.OP_COST_FACTORY["matmul_v2"]( + matmul_v2_cost = cost_model._g_op_cost_factory["matmul_v2"]( op=matmul_v2_op) desc = parse_to_desc(op=matmul_v2_op) desc_str = parse_desc_to_str(desc) self.assertIsNotNone(desc_str) self.assertTrue(check_cost(matmul_v2_cost.cost)) - time = calc_time_from_model(op=matmul_v2_op) + time = calc_time_by_modeling(op=matmul_v2_op) self.assertEqual(time, matmul_v2_cost.cost.time) tensor_cost = cost_model.TensorCost(tensor=x) # check memory self.assertEqual(tensor_cost.cost.memory, 1600) def test_comm_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + # Build CommConetxt + CommContext._has_instance = None + CommContext._instance = None + comm_context = CommContext(cluster) desc = {} desc["op"] = "c_allreduce_sum" - desc["inputs"] = {"X": [([100, 200], paddle.float32)]} - allreduce_cost = cost_model.OP_COST_FACTORY["c_allreduce_sum"]( - op_desc=desc) + desc["inputs"] = {"X": [(paddle.float32, [100, 200])]} + desc["group_ranks"] = [0, 1] + allreduce_cost = cost_model._g_op_cost_factory["c_allreduce_sum"]( + op_desc=desc, comm_context=CommContext(cluster)) self.assertTrue(check_cost(allreduce_cost.cost)) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + def test_cost_estimator(self): train_program = paddle.static.Program() cost_estimator = cost_model.CostEstimator(train_program) diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py index f4217d11f2d9b..dee74fdcb1ff3 100644 --- a/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py @@ -25,8 +25,7 @@ import paddle import paddle.nn as nn -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() +from paddle.fluid.framework import _test_eager_guard np.random.seed(0) @@ -94,7 +93,7 @@ def check(use_cuda): sgd.clear_grad() -if __name__ == '__main__': +def run_check(): if paddle.is_compiled_with_cuda(): try: check(use_cuda=True) @@ -112,3 +111,9 @@ def check(use_cuda): print(e) print(type(e)) assert type(e) == RuntimeError + + +if __name__ == '__main__': + with _test_eager_guard(): + run_check() + run_check() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py index 
f0ed2cdc04950..786ee06487fbc 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py @@ -39,7 +39,7 @@ def prepare_python_path_and_return_module(path): paths.append(dirname) python_path = ":".join(paths) else: - python_path = path + python_path = dirname os.environ[env_name] = python_path print('GLOG_v=', os.environ.get('GLOG_v', None), flush=1) return filename[:-len(py_suffix)] @@ -85,9 +85,9 @@ def apply_passes(self, main_prog, startup_prog): raise NotImplementedError() def check_main(self, model=None, gpus=None, **kwargs): - no_pass_rets = self._distributed_launch( - model=model, apply_pass=True, gpus=gpus, **kwargs) pass_rets = self._distributed_launch( + model=model, apply_pass=True, gpus=gpus, **kwargs) + no_pass_rets = self._distributed_launch( model=model, apply_pass=False, gpus=gpus, **kwargs) self.check_results(no_pass_rets, pass_rets) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py b/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py index 0b522b79c4e93..7eebee47e59a8 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py @@ -59,3 +59,40 @@ def reader(): main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() return main_program, startup_program, [image, label], [loss], reader + + +def simple_net(place, batch_size, image_shape=[784], num_classes=10): + image = paddle.static.data( + shape=[batch_size] + image_shape, dtype='float32', name='image') + label = paddle.static.data( + shape=[batch_size, 1], dtype='int64', name='label') + linears = [nn.Linear(784, 784) for _ in range(3)] + hidden = image + for linear in linears: + hidden = linear(hidden) + hidden = nn.ReLU()(hidden) + loss_fn = nn.loss.CrossEntropyLoss() + loss = loss_fn(hidden, label) + optimizer = paddle.optimizer.Adam(learning_rate=1e-3) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.fuse_all_reduce_ops = False + dist_strategy.without_graph_optimization = True + fleet.init(is_collective=True, strategy=dist_strategy) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(loss) + + rank = paddle.distributed.get_rank() + + def reader(): + seed = get_seed_from_env() + np.random.seed(seed + rank) + for _ in range(10): + image_np = np.random.random(size=image.shape).astype('float32') + label_np = np.random.randint( + low=0, high=num_classes, size=label.shape).astype('int64') + yield image_np, label_np + + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + return main_program, startup_program, [image, label], [loss], reader diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py new file mode 100644 index 0000000000000..8430eb615a20c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py @@ -0,0 +1,41 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.distributed.passes import new_pass, PassManager +import unittest +from dist_pass_test_base import DistPassTestBase +from model_zoo import resnet_model + + +class TestBuildCINNPass(DistPassTestBase): + def init(self): + self.atol = 0.5 + self.rtol = 0.0 + + def apply_passes(self, main_prog, startup_prog): + pass_manager = PassManager([ + new_pass("build_cinn"), + new_pass("fuse_elewise_add_act"), + ]) + pass_manager.apply([main_prog], [startup_prog]) + print(pass_manager.names) + + def test_bs_32(self): + if paddle.is_compiled_with_cinn(): + self.check_main(resnet_model, batch_size=32) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py new file mode 100644 index 0000000000000..e030420d32420 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
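+
+# This test applies the build_cinn pass (together with fuse_elewise_add_act)
+# through PassManager to the simple_net model from model_zoo, asserts that a
+# cinn_launch op appears in the transformed program, and checks that results
+# match the no-pass run exactly (atol = rtol = 0). It only runs when Paddle
+# is compiled with CINN support.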
+ +import paddle +from paddle.distributed.passes import new_pass, PassManager +import unittest +from dist_pass_test_base import DistPassTestBase +from model_zoo import simple_net + + +class TestBuildCINNPass(DistPassTestBase): + def init(self): + self.atol = 0.0 + self.rtol = 0.0 + + def apply_passes(self, main_prog, startup_prog): + pass_manager = PassManager([ + new_pass("build_cinn"), + new_pass("fuse_elewise_add_act"), + ]) + pass_manager.apply([main_prog], [startup_prog]) + op_types = [op.type for op in main_prog.global_block().ops] + self.assertTrue('cinn_launch' in op_types) + + def test_bs_32(self): + if paddle.is_compiled_with_cinn(): + self.check_main(simple_net, batch_size=32) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py index 574a222ba18c9..a1a853f006c0d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py @@ -32,7 +32,6 @@ momentum_rate = 0.9 l2_decay = 1e-4 batch_size = 100 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -147,4 +146,5 @@ def test_sharding_api(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True) test_sharding_api() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py index b1f885e8cffe6..8c07734d513c4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2.py @@ -26,6 +26,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 @@ -224,4 +225,5 @@ def test_dp_stage2(): if __name__ == '__main__': - test_dp_stage2() + with _test_eager_guard(): + test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py index 360992a067f02..b09314ae9e31c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_stage2_offload.py @@ -23,6 +23,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 @@ -107,4 +108,5 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': - test_sharding_stage2_offload() + with _test_eager_guard(): + test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py index 705831d50f171..0ed9b681fdcf5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.utils.internal_storage import GradStorage from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 @@ -138,4 +139,6 @@ def train_mlp(): if __name__ == '__main__': + with _test_eager_guard(): + pass train_mlp() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index 82edd1c17a541..58432540d1b82 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -42,7 +42,6 @@ "pp_degree": 1, "sharding_degree": 1 } -fleet.init(is_collective=True, strategy=strategy) np.random.seed(seed) paddle.seed(seed) @@ -225,4 +224,5 @@ def test_dp_stage2(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py index a7b16bbb75977..cd2d7b3f12765 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -36,6 +36,14 @@ batch_size = 32 linear_size = 1000 +strategy = fleet.DistributedStrategy() +strategy.hybrid_configs = { + "dp_degree": 2, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": 1 +} + np.random.seed(seed) paddle.seed(seed) @@ -109,4 +117,5 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index 82821cd7ee644..fc4002ef405bd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -26,6 +26,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -38,7 +39,6 @@ base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -274,4 +274,7 @@ def test_stage2_stage3(): if __name__ == '__main__': + with _test_eager_guard(): + pass + fleet.init(is_collective=True) test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py index df7ba78d345a3..763a7a8b97fdd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py @@ -23,6 +23,7 @@ from paddle.fluid.dygraph.nn import Linear from paddle.distributed import fleet from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard from 
paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler @@ -33,7 +34,6 @@ base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -196,4 +196,7 @@ def test_stage3_offload(): if __name__ == '__main__': + with _test_eager_guard(): + pass + fleet.init(is_collective=True) test_stage3_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py index a9e94ef09b9ac..db533e6379add 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py @@ -14,6 +14,7 @@ import os import time +import tempfile import unittest import numpy as np @@ -33,32 +34,118 @@ SEED = 2020 STEP_NUM = 10 PRINT_STEP = 2 -MODEL_SAVE_DIR = "./inference" -MODEL_SAVE_PREFIX = "./inference/bert" -MODEL_FILENAME = "bert" + INFER_MODEL_SUFFIX -PARAMS_FILENAME = "bert" + INFER_PARAMS_SUFFIX -DY_STATE_DICT_SAVE_PATH = "./bert.dygraph" - - -def train(bert_config, data_reader, to_static): - with fluid.dygraph.guard(place): - fluid.default_main_program().random_seed = SEED - fluid.default_startup_program().random_seed = SEED - - data_loader = fluid.io.DataLoader.from_generator( - capacity=50, iterable=True) - data_loader.set_batch_generator( - data_reader.data_generator(), places=place) - - bert = PretrainModelLayer( - config=bert_config, weight_sharing=False, use_fp16=False) - - optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) - step_idx = 0 - speed_list = [] - for input_data in data_loader(): - src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_data - next_sent_acc, mask_lm_loss, total_loss = bert( + + +class TestBert(unittest.TestCase): + def setUp(self): + self.bert_config = get_bert_config() + self.data_reader = get_feed_data_reader(self.bert_config) + self.temp_dir = tempfile.TemporaryDirectory() + self.model_save_dir = os.path.join(self.temp_dir.name, 'inference') + self.model_save_prefix = os.path.join(self.model_save_dir, 'bert') + self.model_filename = 'bert' + INFER_MODEL_SUFFIX + self.params_filename = 'bert' + INFER_PARAMS_SUFFIX + self.dy_state_dict_save_path = os.path.join(self.temp_dir.name, + 'bert.dygraph') + + def tearDown(self): + self.temp_dir.cleanup() + + def train(self, bert_config, data_reader, to_static): + with fluid.dygraph.guard(place): + fluid.default_main_program().random_seed = SEED + fluid.default_startup_program().random_seed = SEED + + data_loader = fluid.io.DataLoader.from_generator( + capacity=50, iterable=True) + data_loader.set_batch_generator( + data_reader.data_generator(), places=place) + + bert = PretrainModelLayer( + config=bert_config, weight_sharing=False, use_fp16=False) + + optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) + step_idx = 0 + speed_list = [] + for input_data in data_loader(): + src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_data + next_sent_acc, mask_lm_loss, total_loss = bert( + src_ids=src_ids, + position_ids=pos_ids, + sentence_ids=sent_ids, + input_mask=input_mask, + mask_label=mask_label, + mask_pos=mask_pos, + labels=labels) + total_loss.backward() + optimizer.minimize(total_loss) + bert.clear_gradients() + + acc = np.mean(np.array(next_sent_acc.numpy())) + loss = np.mean(np.array(total_loss.numpy())) + ppl = 
np.mean(np.exp(np.array(mask_lm_loss.numpy()))) + + if step_idx % PRINT_STEP == 0: + if step_idx == 0: + print("Step: %d, loss: %f, ppl: %f, next_sent_acc: %f" % + (step_idx, loss, ppl, acc)) + avg_batch_time = time.time() + else: + speed = PRINT_STEP / (time.time() - avg_batch_time) + speed_list.append(speed) + print( + "Step: %d, loss: %f, ppl: %f, next_sent_acc: %f, speed: %.3f steps/s" + % (step_idx, loss, ppl, acc, speed)) + avg_batch_time = time.time() + + step_idx += 1 + if step_idx == STEP_NUM: + if to_static: + fluid.dygraph.jit.save(bert, self.model_save_prefix) + else: + fluid.dygraph.save_dygraph(bert.state_dict(), + self.dy_state_dict_save_path) + break + return loss, ppl + + def train_dygraph(self, bert_config, data_reader): + program_translator.enable(False) + return self.train(bert_config, data_reader, False) + + def train_static(self, bert_config, data_reader): + program_translator.enable(True) + return self.train(bert_config, data_reader, True) + + def predict_static(self, data): + paddle.enable_static() + exe = fluid.Executor(place) + # load inference model + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + self.model_save_dir, + executor=exe, + model_filename=self.model_filename, + params_filename=self.params_filename) + pred_res = exe.run(inference_program, + feed=dict(zip(feed_target_names, data)), + fetch_list=fetch_targets) + + return pred_res + + def predict_dygraph(self, bert_config, data): + program_translator.enable(False) + with fluid.dygraph.guard(place): + bert = PretrainModelLayer( + config=bert_config, weight_sharing=False, use_fp16=False) + model_dict, _ = fluid.dygraph.load_dygraph( + self.dy_state_dict_save_path) + + bert.set_dict(model_dict) + bert.eval() + + input_vars = [fluid.dygraph.to_variable(x) for x in data] + src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_vars + pred_res = bert( src_ids=src_ids, position_ids=pos_ids, sentence_ids=sent_ids, @@ -66,120 +153,33 @@ def train(bert_config, data_reader, to_static): mask_label=mask_label, mask_pos=mask_pos, labels=labels) - total_loss.backward() - optimizer.minimize(total_loss) - bert.clear_gradients() - - acc = np.mean(np.array(next_sent_acc.numpy())) - loss = np.mean(np.array(total_loss.numpy())) - ppl = np.mean(np.exp(np.array(mask_lm_loss.numpy()))) - - if step_idx % PRINT_STEP == 0: - if step_idx == 0: - print("Step: %d, loss: %f, ppl: %f, next_sent_acc: %f" % - (step_idx, loss, ppl, acc)) - avg_batch_time = time.time() - else: - speed = PRINT_STEP / (time.time() - avg_batch_time) - speed_list.append(speed) - print( - "Step: %d, loss: %f, ppl: %f, next_sent_acc: %f, speed: %.3f steps/s" - % (step_idx, loss, ppl, acc, speed)) - avg_batch_time = time.time() - - step_idx += 1 - if step_idx == STEP_NUM: - if to_static: - fluid.dygraph.jit.save(bert, MODEL_SAVE_PREFIX) - else: - fluid.dygraph.save_dygraph(bert.state_dict(), - DY_STATE_DICT_SAVE_PATH) - break - return loss, ppl - - -def train_dygraph(bert_config, data_reader): - program_translator.enable(False) - return train(bert_config, data_reader, False) - - -def train_static(bert_config, data_reader): - program_translator.enable(True) - return train(bert_config, data_reader, True) - - -def predict_static(data): - paddle.enable_static() - exe = fluid.Executor(place) - # load inference model - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_DIR, - executor=exe, - model_filename=MODEL_FILENAME, - 
params_filename=PARAMS_FILENAME) - pred_res = exe.run(inference_program, - feed=dict(zip(feed_target_names, data)), - fetch_list=fetch_targets) - - return pred_res - - -def predict_dygraph(bert_config, data): - program_translator.enable(False) - with fluid.dygraph.guard(place): - bert = PretrainModelLayer( - config=bert_config, weight_sharing=False, use_fp16=False) - model_dict, _ = fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH) - - bert.set_dict(model_dict) - bert.eval() - - input_vars = [fluid.dygraph.to_variable(x) for x in data] - src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_vars - pred_res = bert( - src_ids=src_ids, - position_ids=pos_ids, - sentence_ids=sent_ids, - input_mask=input_mask, - mask_label=mask_label, - mask_pos=mask_pos, - labels=labels) - pred_res = [var.numpy() for var in pred_res] + pred_res = [var.numpy() for var in pred_res] - return pred_res - - -def predict_dygraph_jit(data): - with fluid.dygraph.guard(place): - bert = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) - bert.eval() - - src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = data - pred_res = bert(src_ids, pos_ids, sent_ids, input_mask, mask_label, - mask_pos, labels) - pred_res = [var.numpy() for var in pred_res] - - return pred_res + return pred_res + def predict_dygraph_jit(self, data): + with fluid.dygraph.guard(place): + bert = fluid.dygraph.jit.load(self.model_save_prefix) + bert.eval() -def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, - data) - out = output() - return out + src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = data + pred_res = bert(src_ids, pos_ids, sent_ids, input_mask, mask_label, + mask_pos, labels) + pred_res = [var.numpy() for var in pred_res] + return pred_res -class TestBert(unittest.TestCase): - def setUp(self): - self.bert_config = get_bert_config() - self.data_reader = get_feed_data_reader(self.bert_config) + def predict_analysis_inference(self, data): + output = PredictorTools(self.model_save_dir, self.model_filename, + self.params_filename, data) + out = output() + return out def test_train(self): - static_loss, static_ppl = train_static(self.bert_config, - self.data_reader) - dygraph_loss, dygraph_ppl = train_dygraph(self.bert_config, - self.data_reader) + static_loss, static_ppl = self.train_static(self.bert_config, + self.data_reader) + dygraph_loss, dygraph_ppl = self.train_dygraph(self.bert_config, + self.data_reader) self.assertTrue( np.allclose(static_loss, dygraph_loss), msg="static_loss: {} \n dygraph_loss: {}".format(static_loss, @@ -193,10 +193,10 @@ def test_train(self): def verify_predict(self): for data in self.data_reader.data_generator()(): - dygraph_pred_res = predict_dygraph(self.bert_config, data) - static_pred_res = predict_static(data) - dygraph_jit_pred_res = predict_dygraph_jit(data) - predictor_pred_res = predict_analysis_inference(data) + dygraph_pred_res = self.predict_dygraph(self.bert_config, data) + static_pred_res = self.predict_static(data) + dygraph_jit_pred_res = self.predict_dygraph_jit(data) + predictor_pred_res = self.predict_analysis_inference(data) for dy_res, st_res, dy_jit_res, predictor_res in zip( dygraph_pred_res, static_pred_res, dygraph_jit_pred_res, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py index f7d469327a307..95ea5ad227eeb 100644 --- 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py @@ -18,8 +18,7 @@ import numpy as np from paddle.jit import ProgramTranslator -from test_resnet import ResNet, train, predict_dygraph_jit -from test_resnet import predict_dygraph, predict_static, predict_analysis_inference +from test_resnet import ResNet, ResNetHelper program_translator = ProgramTranslator() @@ -31,20 +30,20 @@ def setUp(self): self.build_strategy.fuse_bn_act_ops = True self.build_strategy.fuse_bn_add_act_ops = True self.build_strategy.enable_addto = True + self.resnet_helper = ResNetHelper() # NOTE: for enable_addto paddle.fluid.set_flags({"FLAGS_max_inplace_grad_add": 8}) def train(self, to_static): program_translator.enable(to_static) - - return train(to_static, self.build_strategy) + return self.resnet_helper.train(to_static, self.build_strategy) def verify_predict(self): image = np.random.random([1, 3, 224, 224]).astype('float32') - dy_pre = predict_dygraph(image) - st_pre = predict_static(image) - dy_jit_pre = predict_dygraph_jit(image) - predictor_pre = predict_analysis_inference(image) + dy_pre = self.resnet_helper.predict_dygraph(image) + st_pre = self.resnet_helper.predict_static(image) + dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image) + predictor_pre = self.resnet_helper.predict_analysis_inference(image) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) @@ -69,7 +68,7 @@ def test_in_static_mode_mkldnn(self): paddle.fluid.set_flags({'FLAGS_use_mkldnn': True}) try: if paddle.fluid.core.is_compiled_with_mkldnn(): - train(True, self.build_strategy) + self.resnet_helper.train(True, self.build_strategy) finally: paddle.fluid.set_flags({'FLAGS_use_mkldnn': False}) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index efb69b530efc9..1a531c65bbf1e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -14,8 +14,10 @@ from __future__ import print_function +import os import math import time +import tempfile import unittest import numpy as np @@ -39,11 +41,6 @@ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ else fluid.CPUPlace() -MODEL_SAVE_DIR = "./inference" -MODEL_SAVE_PREFIX = "./inference/resnet" -MODEL_FILENAME = "resnet" + INFER_MODEL_SUFFIX -PARAMS_FILENAME = "resnet" + INFER_PARAMS_SUFFIX -DY_STATE_DICT_SAVE_PATH = "./resnet.dygraph" program_translator = ProgramTranslator() if fluid.is_compiled_with_cuda(): @@ -212,130 +209,148 @@ def __reader__(): return __reader__ -def train(to_static, build_strategy=None): - """ - Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. 
- """ - with fluid.dygraph.guard(place): - np.random.seed(SEED) - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - - train_reader = paddle.batch( - reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), - batch_size=batch_size, - drop_last=True) - data_loader = fluid.io.DataLoader.from_generator( - capacity=5, iterable=True) - data_loader.set_sample_list_generator(train_reader) - - resnet = ResNet() - if to_static: - resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy) - optimizer = optimizer_setting(parameter_list=resnet.parameters()) - - for epoch in range(epoch_num): - total_loss = 0.0 - total_acc1 = 0.0 - total_acc5 = 0.0 - total_sample = 0 - - for batch_id, data in enumerate(data_loader()): - start_time = time.time() - img, label = data - - pred = resnet(img) - loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(x=loss) - acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5) - - avg_loss.backward() - optimizer.minimize(avg_loss) - resnet.clear_gradients() - - total_loss += avg_loss - total_acc1 += acc_top1 - total_acc5 += acc_top5 - total_sample += 1 - - end_time = time.time() - if batch_id % 2 == 0: - print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \ - ( epoch, batch_id, total_loss.numpy() / total_sample, \ - total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) - if batch_id == 10: - if to_static: - fluid.dygraph.jit.save(resnet, MODEL_SAVE_PREFIX) - else: - fluid.dygraph.save_dygraph(resnet.state_dict(), - DY_STATE_DICT_SAVE_PATH) - # avoid dataloader throw abort signaal - data_loader._reset() - break - - return total_loss.numpy() - - -def predict_dygraph(data): - program_translator.enable(False) - with fluid.dygraph.guard(place): - resnet = ResNet() - - model_dict, _ = fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH) - resnet.set_dict(model_dict) - resnet.eval() - - pred_res = resnet(fluid.dygraph.to_variable(data)) - - return pred_res.numpy() - - -def predict_static(data): - paddle.enable_static() - exe = fluid.Executor(place) - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_DIR, - executor=exe, - model_filename=MODEL_FILENAME, - params_filename=PARAMS_FILENAME) - - pred_res = exe.run(inference_program, - feed={feed_target_names[0]: data}, - fetch_list=fetch_targets) - - return pred_res[0] - - -def predict_dygraph_jit(data): - with fluid.dygraph.guard(place): - resnet = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) - resnet.eval() - - pred_res = resnet(data) - - return pred_res.numpy() - - -def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, - [data]) - out = output() - return out +class ResNetHelper: + def __init__(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.model_save_dir = os.path.join(self.temp_dir.name, 'inference') + self.model_save_prefix = os.path.join(self.model_save_dir, 'resnet') + self.model_filename = 'resnet' + INFER_MODEL_SUFFIX + self.params_filename = 'resnet' + INFER_PARAMS_SUFFIX + self.dy_state_dict_save_path = os.path.join(self.temp_dir.name, + 'resnet.dygraph') + + def __del__(self): + self.temp_dir.cleanup() + + def train(self, to_static, build_strategy=None): + """ + Tests model decorated by `dygraph_to_static_output` in static mode. 
For users, the model is defined in dygraph mode and trained in static mode. + """ + with fluid.dygraph.guard(place): + np.random.seed(SEED) + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + train_reader = paddle.batch( + reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + data_loader = fluid.io.DataLoader.from_generator( + capacity=5, iterable=True) + data_loader.set_sample_list_generator(train_reader) + + resnet = ResNet() + if to_static: + resnet = paddle.jit.to_static( + resnet, build_strategy=build_strategy) + optimizer = optimizer_setting(parameter_list=resnet.parameters()) + + for epoch in range(epoch_num): + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + + for batch_id, data in enumerate(data_loader()): + start_time = time.time() + img, label = data + + pred = resnet(img) + loss = fluid.layers.cross_entropy(input=pred, label=label) + avg_loss = fluid.layers.mean(x=loss) + acc_top1 = fluid.layers.accuracy( + input=pred, label=label, k=1) + acc_top5 = fluid.layers.accuracy( + input=pred, label=label, k=5) + + avg_loss.backward() + optimizer.minimize(avg_loss) + resnet.clear_gradients() + + total_loss += avg_loss + total_acc1 += acc_top1 + total_acc5 += acc_top5 + total_sample += 1 + + end_time = time.time() + if batch_id % 2 == 0: + print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \ + ( epoch, batch_id, total_loss.numpy() / total_sample, \ + total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) + if batch_id == 10: + if to_static: + fluid.dygraph.jit.save(resnet, + self.model_save_prefix) + else: + fluid.dygraph.save_dygraph( + resnet.state_dict(), + self.dy_state_dict_save_path) + # avoid dataloader throw abort signaal + data_loader._reset() + break + + return total_loss.numpy() + + def predict_dygraph(self, data): + program_translator.enable(False) + with fluid.dygraph.guard(place): + resnet = ResNet() + + model_dict, _ = fluid.dygraph.load_dygraph( + self.dy_state_dict_save_path) + resnet.set_dict(model_dict) + resnet.eval() + + pred_res = resnet(fluid.dygraph.to_variable(data)) + + return pred_res.numpy() + + def predict_static(self, data): + paddle.enable_static() + exe = fluid.Executor(place) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + self.model_save_dir, + executor=exe, + model_filename=self.model_filename, + params_filename=self.params_filename) + + pred_res = exe.run(inference_program, + feed={feed_target_names[0]: data}, + fetch_list=fetch_targets) + + return pred_res[0] + + def predict_dygraph_jit(self, data): + with fluid.dygraph.guard(place): + resnet = fluid.dygraph.jit.load(self.model_save_prefix) + resnet.eval() + + pred_res = resnet(data) + + return pred_res.numpy() + + def predict_analysis_inference(self, data): + output = PredictorTools(self.model_save_dir, self.model_filename, + self.params_filename, [data]) + out = output() + return out class TestResnet(unittest.TestCase): + def setUp(self): + self.resnet_helper = ResNetHelper() + def train(self, to_static): program_translator.enable(to_static) - return train(to_static) + return self.resnet_helper.train(to_static) def verify_predict(self): image = np.random.random([1, 3, 224, 224]).astype('float32') - dy_pre = predict_dygraph(image) - st_pre = predict_static(image) - dy_jit_pre = predict_dygraph_jit(image) - predictor_pre = predict_analysis_inference(image) + dy_pre = 
self.resnet_helper.predict_dygraph(image) + st_pre = self.resnet_helper.predict_static(image) + dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image) + predictor_pre = self.resnet_helper.predict_analysis_inference(image) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) @@ -360,7 +375,7 @@ def test_in_static_mode_mkldnn(self): fluid.set_flags({'FLAGS_use_mkldnn': True}) try: if paddle.fluid.core.is_compiled_with_mkldnn(): - train(to_static=True) + self.resnet_helper.train(to_static=True) finally: fluid.set_flags({'FLAGS_use_mkldnn': False}) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py index 06f2c60dfae9f..c8fe3e3932914 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py @@ -15,6 +15,7 @@ import logging import os import time +import tempfile import unittest import numpy as np @@ -371,8 +372,21 @@ def predict_static(args, batch_generator): class TestTransformer(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDwon(self): + self.temp_dir.cleanup() + def prepare(self, mode='train'): args = util.ModelHyperParams() + args.save_dygraph_model_path = os.path.join( + self.temp_dir.name, args.save_dygraph_model_path) + args.save_static_model_path = os.path.join(self.temp_dir.name, + args.save_static_model_path) + args.inference_model_dir = os.path.join(self.temp_dir.name, + args.inference_model_dir) + args.output_file = os.path.join(self.temp_dir.name, args.output_file) batch_generator = util.get_feed_data_reader(args, mode) return args, batch_generator diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py index 45f75f1b4df81..21a6655406729 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -27,12 +27,13 @@ def test_set_options(self): ipu_strategy = paddle.static.IpuStrategy() all_option_names = ipu_strategy._ipu_strategy.get_all_option_names() skip_options = [] + skip_options.append( + 'mean_accumulation_and_replication_reduction_strategy') skip_options.append('random_seed') for option_name in all_option_names: if option_name in skip_options: continue - option = ipu_strategy._ipu_strategy.get_option(option_name) option_type = option['type'] option_value = option['value'] @@ -67,7 +68,7 @@ def test_set_string_options(self): def test_set_other_options(self): ipu_strategy = paddle.static.IpuStrategy() options = {} - options['dot_checks'] = ['0', '1', '2', '3'] + options['dot_checks'] = ['Fwd0', 'Fwd1', 'Bwd0', 'PreAlias', "Final"] options['engine_options'] = { 'debug.allowOutOfMemory': 'true', 'autoReport.directory': 'path', @@ -76,7 +77,12 @@ def test_set_other_options(self): options['random_seed'] = 1234 for k, v in options.items(): ipu_strategy.set_options({k: v}) - assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed " + if (isinstance(v, list)): + assert v.sort() == ipu_strategy.get_option(k).sort( + ), f"set {k} to {v} failed " + else: + assert v == ipu_strategy.get_option( + k), f"set {k} to {v} failed " # The custom logger need 2 int as inputs logger = lambda progress, total: print(f"compile progrss: {progress}/{total}") diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py index 792b88849faf3..884162d336f35 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py @@ -148,6 +148,36 @@ def set_data_feed(self): } +class TestReplicaCollectiveInference(TestBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + "accumulate_outer_fragment": { + 0: [] + }, + "replicated_collectives_settings": { + "prepare_schedule_for_merging_collectives": True, + "merge_all_reduce_collectives": True + } + } + self.cpu_bs = 1 + self.ipu_bs = 1 + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np_image} + self.feed_ipu = { + "image": + np.tile(np_image, + [self.ipu_options['replicated_graph_count'], 1, 1, 1]) + } + + class TestPipelineInference(TestBase): def set_attrs(self): self.ipu_options = { @@ -190,6 +220,36 @@ def set_attrs(self): class TestReplicaTrain(TestTrainBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": True, + "replicated_graph_count": 2 + } + self.cpu_bs = 2 + self.ipu_bs = 1 + self.optimizer = 'sgd' + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np.tile(np_image, [self.cpu_bs, 1, 1, 1])} + self.feed_ipu = { + "image": + np.tile(np_image, + [self.ipu_options['replicated_graph_count'], 1, 1, 1]) + } + + def test(self): + cpu_outputs = self._test_base(False) + ipu_outputs = self._test_base(True)[::2] + + self.assertTrue(np.allclose(cpu_outputs, ipu_outputs, atol=self.atol)) + + +class TestReplicaCollectiveTrain(TestTrainBase): def set_attrs(self): self.ipu_options = { "batches_per_step": 1, @@ -198,6 +258,13 @@ def set_attrs(self): "accumulation_factor": 1, "enable_replicated_graphs": True, "replicated_graph_count": 2, + "accumulate_outer_fragment": { + 0: [] + }, + "replicated_collectives_settings": { + "prepare_schedule_for_merging_collectives": True, + "merge_all_reduce_collectives": True + } } self.cpu_bs = 2 self.ipu_bs = 1 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 661fbbc7759c6..4717dfa1eab52 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -143,5 +143,6 @@ if (WITH_MKLDNN) set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_fc_mish_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_fc_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass PROPERTIES TIMEOUT 60) endif() endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index bb8c6e73fdefa..161c785ef8565 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -388,7 
+388,7 @@ def run_test(prog_config): used_time = time.time() - start_time if max_duration > 0 and used_time > max_duration: logging.error( - "The duration exceeds {} seconds, if this is neccessary, try to set a larger number for parameter `max_duration`.". + "The duration exceeds {} seconds, if this is necessary, try to set a larger number for parameter `max_duration`.". format(max_duration)) assert False diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py new file mode 100644 index 0000000000000..a35b75e69f812 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestConvAffineChannelFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.integers(min_value=1, max_value=3)) + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + axis = draw(st.sampled_from([1])) + filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 + filter_size = draw(st.integers(min_value=1, max_value=4)) + in_channel = groups * filter_channel + out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 + out_channel = groups * out_channel_factor + batch_size = draw(st.integers(min_value=1, max_value=4)) + dilations = draw( + st.lists( + st.integers( + min_value=1, max_value=2), min_size=2, max_size=2)) + paddings = draw( + st.lists( + st.integers( + min_value=0, max_value=2), min_size=2, max_size=2)) + strides = draw( + st.lists( + st.integers( + min_value=1, max_value=2), min_size=2, max_size=2)) + has_bias = draw(st.booleans()) + + x_shape = [ + batch_size, in_channel, 64, 64 + ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] + w_shape = [out_channel, filter_channel, filter_size, filter_size] + scale_shape = [out_channel] + bias_shape = [out_channel] + + def generate_input(): + return np.random.random(x_shape).astype(np.float32) + + def generate_weight(): + return np.random.random(w_shape).astype(np.float32) + + def generate_bias(): + return np.random.random(bias_shape).astype(np.float32) + + def generate_scale_bias(): + return np.random.random(bias_shape).astype(np.float32) + + conv2d_op = OpConfig( + "conv2d", + 
inputs={ + "Input": ["input_data"], + "Filter": ["conv2d_weight"], + }, + outputs={"Output": ["conv_output"]}, + data_format=data_format, + dilations=dilations, + padding_algorithm=padding_algorithm, + groups=groups, + paddings=paddings, + strides=strides, + has_bias=has_bias, + is_test=True) + ac_op = OpConfig( + "affine_channel", + inputs={ + "X": ["conv_output"], + "Scale": ["affine_channel_scale"], + "Bias": ["affine_channel_bias"] + }, + outputs={"Out": ["affine_channel_ouput"]}, + data_layout=data_format) + if has_bias == True: + conv2d_op.inputs["Bias"] = ["conv2d_bias"] + ops = [conv2d_op, ac_op] + + program_config = ProgramConfig( + ops=ops, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)), + }, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial(generate_weight)), + "conv2d_bias": TensorConfig(data_gen=partial(generate_bias)), + "affine_channel_scale": + TensorConfig(data_gen=partial(generate_scale_bias)), + "affine_channel_bias": + TensorConfig(data_gen=partial(generate_scale_bias)), + }, + outputs=["affine_channel_ouput"]) + if has_bias == True: + program_config.weights["conv2d_bias"] = TensorConfig( + data_gen=partial(generate_bias)) + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) + + def add_ignore_pass_case(self): + # If the problem has been fixed, the judgment + # in is_program_valid needs to be deleted!!! + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs['data_format'] == "NHWC": + return True + return False + + # mkldnn Output has diff with bias! + def teller2(program_config, predictor_config): + return predictor_config.mkldnn_enabled() and program_config.ops[ + 0].attrs['has_bias'] == True + + self.add_ignore_check_case( + teller1, IgnoreReasons.PASS_ACCURACY_ERROR, + "The output format of conv2d is wrong when data_format attribute is NHWC, \ + because currently its fused op (Conv2DFusion) only supports data format of channel first (NCHW)." + ) + + self.add_ignore_check_case( + teller2, IgnoreReasons.PASS_ACCURACY_ERROR, + "Currently mkldnn Output has diff with bias!") + + def test(self): + self.run_and_statis( + quant=False, + passes=["conv_affine_channel_mkldnn_fuse_pass"], ) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py new file mode 100644 index 0000000000000..31415f6472587 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py @@ -0,0 +1,146 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
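+
+# Auto-scan test for int8_scale_calculation_mkldnn_pass: it draws random
+# conv2d programs (NCHW/NHWC layouts, groups, paddings, dilations, optional
+# bias) with use_mkldnn=True and mkldnn_data_type="int8", filters out shape
+# combinations that would be invalid in is_program_valid, and expects the
+# pass to keep a single conv2d op whose outputs stay within (1e-4, 1e-5)
+# tolerance of the reference run.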
+ +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import unittest + +import hypothesis.strategies as st + + +class TestInt8ScaleCalculationMkldnnPass(PassAutoScanTest): + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_gpu=False) + config.pass_builder().append_pass("int8_scale_calculation_mkldnn_pass") + yield config, ["conv2d"], (1e-4, 1e-5) + + def is_program_valid(self, prog_config): + paddings = prog_config.ops[0].attrs["paddings"] + strides = prog_config.ops[0].attrs["strides"] + groups = prog_config.ops[0].attrs["groups"] + padding_algorithm = prog_config.ops[0].attrs["padding_algorithm"] + dilations = prog_config.ops[0].attrs["dilations"] + data_format = prog_config.ops[0].attrs["data_format"] + filter_shape = prog_config.weights["filter"].shape + input_shape = prog_config.inputs["input_x"].shape + if padding_algorithm == "VALID": + if ((input_shape[2] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \ + ((input_shape[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1: + return False + if padding_algorithm == "EXPLICIT": + if ((input_shape[2] + paddings[0] + paddings[1] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \ + ((input_shape[3] + paddings[2] + paddings[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1: + return False + if data_format == "NCHW": + if input_shape[1] != filter_shape[1] * groups: + return False + if filter_shape[0] % groups != 0: + return False + else: + if input_shape[3] != filter_shape[1] * groups: + return False + if filter_shape[0] % groups != 0: + return False + return True + + def sample_program_config(self, draw): + x_shape = draw( + st.lists( + st.integers( + min_value=5, max_value=100), min_size=4, max_size=4)) + x_shape[1] = draw(st.integers(min_value=5, max_value=10)) + + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + + f_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=4, max_size=4)) + if data_format == "NCHW": + f_shape[1] = x_shape[1] + else: + f_shape[1] = x_shape[3] + + strides = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, max_size=2)) + + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + + padding = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=4, max_size=4)) + + groups = draw(st.integers(min_value=1, max_value=3)) + + dilations = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, max_size=2)) + + bias_shape = [f_shape[0]] + inputs = dict() + weights = dict() + use_mkldnn = True + + has_bias = draw(st.booleans()) + if has_bias: + inputs = { + "Input": ["input_x"], + "Filter": ["filter"], + } + weights = { + "filter": TensorConfig(shape=f_shape), + "bias": TensorConfig(shape=bias_shape), + } + else: + inputs = { + "Input": ["input_x"], + "Filter": ["filter"], + } + weights = {"filter": TensorConfig(shape=f_shape), } + + conv2d_op = OpConfig( + "conv2d", + inputs=inputs, + outputs={"Output": ["conv2d_out"]}, + strides=strides, + padding_algorithm=padding_algorithm, + paddings=padding, + groups=groups, + dilations=dilations, + data_format=data_format, + use_mkldnn=use_mkldnn, + mkldnn_data_type="int8") + + ops = [conv2d_op] + + program_config = ProgramConfig( + ops=ops, + weights=weights, + inputs={"input_x": TensorConfig(shape=x_shape)}, + outputs=["conv2d_out"]) + return program_config + + def test(self): 
+ self.run_and_statis( + quant=False, + max_examples=100, + passes=["int8_scale_calculation_mkldnn_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py index a864e2fe5a1c8..1781eb5048347 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py @@ -62,13 +62,13 @@ def generate_reshape2_Input(): "transpose2", inputs={"X": ["reshape2_output1"], }, outputs={ - "Out": ["transpose2_ouput"], + "Out": ["transpose2_output"], "XShape": ["transpose2_xshape"] }, axis=axis_v) reshape2_op2 = OpConfig( "reshape2", - inputs={"X": ["transpose2_ouput"], }, + inputs={"X": ["transpose2_output"], }, outputs={ "Out": ["reshape2_output2"], "XShape": ["reshape2_xshape2"] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py index 04eb3ab10ba7a..6a204ebbad27d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py @@ -103,6 +103,9 @@ def generate_trt_nodes_num(attrs, dynamic_shape): for x in attrs[0]["axes"]: if x == 0: return 0, 3 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + return 0, 3 return 1, 2 attrs = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py new file mode 100644 index 0000000000000..2abf0a1acda67 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py @@ -0,0 +1,132 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
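
Note (not part of the patch): the strided_slice change above gates the expected TensorRT layer count on the compile-time TensorRT version by folding the version tuple into one integer. A minimal sketch of that encoding, assuming, as the test code implies, that paddle_infer.get_trt_compile_version() returns a (major, minor, patch) tuple:

def trt_version_code(ver):
    # Fold (major, minor, patch) into a single comparable integer,
    # mirroring the expression used in generate_trt_nodes_num above.
    major, minor, patch = ver
    return major * 1000 + minor * 100 + patch * 10

# Versions older than 7.0.0 fall back to 0 TRT layers and 3 framework ops.
assert trt_version_code((7, 0, 0)) == 7000
assert trt_version_code((6, 5, 1)) < 7000
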
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertActivationTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.ones([32]).astype(np.float32) + elif dims == 2: + return np.ones([3, 32]).astype(np.float32) + elif dims == 3: + return np.ones([3, 32, 32]).astype(np.float32) + else: + return np.ones([batch, 3, 32, 32]).astype(np.float32) + + for dims in [1, 2, 3, 4]: + for batch in [1, 4]: + for op_type in ["exp", "log"]: + self.dims = dims + dics = [{}] + + ops_config = [{ + "op_type": op_type, + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial( + generate_input1, dims, batch, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 16, 16] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 32, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py new file mode 100644 index 0000000000000..ece2d187fb9da --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List, Dict, Any +import unittest + + +class TrtConvertYoloBoxHeadTest(TrtLayerAutoScanTest): + def sample_program_configs(self): + def generate_input(attrs: List[Dict[str, Any]], batch, shape): + gen_shape = shape.copy() + gen_shape.insert(0, batch) + return np.random.uniform(0, 1, gen_shape).astype("float32") + + input_shape = [[255, 19, 19], [255, 38, 38], [255, 76, 76]] + anchors = [[116, 90, 156, 198, 373, 326], [30, 61, 62, 45, 59, 119], + [10, 13, 16, 30, 33, 23]] + class_num = 80 + for batch in [1, 4]: + for i in range(len(anchors)): + attrs_dict = { + "anchors": anchors[i], + "class_num": class_num, + } + ops_config = [{ + "op_type": "yolo_box_head", + "op_inputs": { + "X": ["yolo_box_head_input"], + }, + "op_outputs": { + "Out": ["yolo_box_head_output"], + }, + "op_attrs": attrs_dict + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "yolo_box_head_input": TensorConfig(data_gen=partial( + generate_input, attrs_dict, batch, input_shape[i])) + }, + outputs=["yolo_box_head_output"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + # for static_shape + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), [1, 2], 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py b/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py new file mode 100644 index 0000000000000..2fb83fb039215 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +paddle.enable_static() + + +def yolo_box_post(box0, + box1, + box2, + im_shape, + im_scale, + anchors0=[116, 90, 156, 198, 373, 326], + anchors1=[30, 61, 62, 45, 59, 119], + anchors2=[10, 13, 16, 30, 33, 23], + class_num=80, + conf_thresh=0.005, + downsample_ratio0=32, + downsample_ratio1=16, + downsample_ratio2=8, + clip_bbox=True, + scale_x_y=1., + nms_threshold=0.45): + helper = LayerHelper('yolo_box_post', **locals()) + output = helper.create_variable_for_type_inference(dtype=box0.dtype) + nms_rois_num = helper.create_variable_for_type_inference(dtype='int32') + inputs = { + 'Boxes0': box0, + 'Boxes1': box1, + 'Boxes2': box2, + "ImageShape": im_shape, + "ImageScale": im_scale + } + outputs = {'Out': output, 'NmsRoisNum': nms_rois_num} + + helper.append_op( + type="yolo_box_post", + inputs=inputs, + attrs={ + 'anchors0': anchors0, + 'anchors1': anchors1, + 'anchors2': anchors2, + 'class_num': class_num, + 'conf_thresh': conf_thresh, + 'downsample_ratio0': downsample_ratio0, + 'downsample_ratio1': downsample_ratio1, + 'downsample_ratio2': downsample_ratio2, + 'clip_bbox': clip_bbox, + 'scale_x_y': scale_x_y, + 'nms_threshold': nms_threshold + }, + outputs=outputs) + output.stop_gradient = True + nms_rois_num.stop_gradient = True + return output, nms_rois_num + + +@unittest.skipIf(not paddle.is_compiled_with_cuda(), + "only support cuda kernel.") +class TestYoloBoxPost(unittest.TestCase): + def test_yolo_box_post(self): + place = paddle.CUDAPlace(0) + program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(program, startup_program): + box0 = paddle.static.data("box0", [1, 255, 19, 19]) + box1 = paddle.static.data("box1", [1, 255, 38, 38]) + box2 = paddle.static.data("box2", [1, 255, 76, 76]) + im_shape = paddle.static.data("im_shape", [1, 2]) + im_scale = paddle.static.data("im_scale", [1, 2]) + out, rois_num = yolo_box_post(box0, box1, box2, im_shape, im_scale) + exe = paddle.static.Executor(place) + exe.run(startup_program) + feed = { + "box0": np.random.uniform(size=[1, 255, 19, 19]).astype("float32"), + "box1": np.random.uniform(size=[1, 255, 38, 38]).astype("float32"), + "box2": np.random.uniform(size=[1, 255, 76, 76]).astype("float32"), + "im_shape": np.array([[608., 608.]], "float32"), + "im_scale": np.array([[1., 1.]], "float32") + } + outs = exe.run(program, feed=feed, fetch_list=[out.name, rois_num.name]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/pass_test.py b/python/paddle/fluid/tests/unittests/ir/pass_test.py index aae1cc65c9220..e92821387aed4 100644 --- a/python/paddle/fluid/tests/unittests/ir/pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py @@ -167,7 +167,7 @@ def check_output_with_place(self, place, startup_on_cpu=False, atol=1e-5): def _check_fused_ops(self, program): ''' - Check the number of specified fused op is equal to the the expected + Check the number of specified fused op is equal to the expected number. 
''' if self.fused_op_type is None or self.num_fused_ops < 0: diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py new file mode 100644 index 0000000000000..02fb890220431 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py @@ -0,0 +1,92 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +paddle.enable_static() + + +def multiclass_nms(bboxes, + scores, + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold=0.3, + normalized=True, + nms_eta=1., + background_label=-1): + helper = LayerHelper('multiclass_nms3', **locals()) + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype='int32') + nms_rois_num = helper.create_variable_for_type_inference(dtype='int32') + inputs = {'BBoxes': bboxes, 'Scores': scores} + outputs = {'Out': output, 'Index': index, 'NmsRoisNum': nms_rois_num} + + helper.append_op( + type="multiclass_nms3", + inputs=inputs, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'nms_top_k': nms_top_k, + 'nms_threshold': nms_threshold, + 'keep_top_k': keep_top_k, + 'nms_eta': nms_eta, + 'normalized': normalized + }, + outputs=outputs) + output.stop_gradient = True + index.stop_gradient = True + + return output, index, nms_rois_num + + +class TestYoloBoxPass(unittest.TestCase): + def test_yolo_box_pass(self): + program = paddle.static.Program() + with paddle.static.program_guard(program): + im_shape = paddle.static.data("im_shape", [1, 2]) + im_scale = paddle.static.data("im_scale", [1, 2]) + yolo_box0_x = paddle.static.data("yolo_box0_x", [1, 255, 19, 19]) + yolo_box1_x = paddle.static.data("yolo_box1_x", [1, 255, 38, 38]) + yolo_box2_x = paddle.static.data("yolo_box2_x", [1, 255, 76, 76]) + div = paddle.divide(im_shape, im_scale) + cast = paddle.cast(div, "int32") + boxes0, scores0 = paddle.vision.ops.yolo_box( + yolo_box0_x, cast, [116, 90, 156, 198, 373, 326], 80, 0.005, 32) + boxes1, scores1 = paddle.vision.ops.yolo_box( + yolo_box1_x, cast, [30, 61, 62, 45, 59, 119], 80, 0.005, 16) + boxes2, scores2 = paddle.vision.ops.yolo_box( + yolo_box2_x, cast, [10, 13, 16, 30, 33, 23], 80, 0.005, 8) + transpose0 = paddle.transpose(scores0, [0, 2, 1]) + transpose1 = paddle.transpose(scores1, [0, 2, 1]) + transpose2 = paddle.transpose(scores2, [0, 2, 1]) + concat0 = paddle.concat([boxes0, boxes1, boxes2], 1) + concat1 = paddle.concat([transpose0, transpose1, transpose2], 2) + out0, out1, out2 = multiclass_nms(concat0, concat1, 0.01, 1000, 100, + 0.45, True, 1., 80) + graph = core.Graph(program.desc) + core.get_pass("yolo_box_fuse_pass").apply(graph) + graph = paddle.fluid.framework.IrGraph(graph) + op_nodes = graph.all_op_nodes() + for op_node in op_nodes: + 
op_type = op_node.op().type() + self.assertTrue(op_type in ["yolo_box_head", "yolo_box_post"]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py index 90614ccb3bc15..11b8858b6b195 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py @@ -20,6 +20,8 @@ import os from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import _global_flags +from paddle.fluid.framework import _enable_legacy_dygraph +_enable_legacy_dygraph() def check(): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py new file mode 100644 index 0000000000000..d729efbb0fb60 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool +import paddle + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestFillConstant2DOneDNNOp(OpTest): + def setUp(self): + self.op_type = "fill_constant" + self.dtype = np.float32 + + self.shape_tensor_list = None + self.shape_tensor = None + self.str_value = "" + real_shape = [] + self.value = 0.1 + + self.set_inputs() + self.set_attrs() + + if 'value' in self.attrs: + self.value = self.attrs['value'] + if self.str_value != "": + self.value = float(self.str_value) + if 'ValueTensor' in self.inputs: + self.value = self.inputs['ValueTensor'] + + if 'shape' in self.attrs: + real_shape = self.attrs['shape'] + if 'ShapeTensor' in self.inputs: + real_shape = list(self.inputs['ShapeTensor']) + if 'ShapeTensorList' in self.inputs: + real_shape = [] + for shape_tensor in self.inputs['ShapeTensorList']: + real_shape.append(shape_tensor[1].item()) + + self.outputs = {'Out': np.full(real_shape, self.value)} + + def set_inputs(self): + self.inputs = {} + + def set_attrs(self): + self.attrs = {'shape': (3, 5), 'use_mkldnn': True, 'value': self.value} + + def test_check_output(self): + self.check_output() + + +class TestFillZerosLike4DShapeTensorPriorityOneDNNOp( + TestFillConstant2DOneDNNOp): + def set_inputs(self): + self.inputs = {'ShapeTensor': np.array([5, 6, 7, 8]).astype("int32")} + + +class TestFillZerosLike4DShapeTensorListPriorityOneDNNOp( + TestFillConstant2DOneDNNOp): + def set_inputs(self): + shape = (4, 5, 6, 7) + self.shape_tensor_list = [] + for index, elem in enumerate(shape): + self.shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * elem)) + + self.inputs = {'ShapeTensorList': self.shape_tensor_list} + + +class 
TestFillZerosLike2DStringValueInfOneDNNOp(TestFillConstant2DOneDNNOp): + def set_attrs(self): + self.str_value = "inf" + self.attrs = {'shape': (10, 13), 'use_mkldnn': True, 'str_value': "inf"} + + +class TestFillZerosLike2DStringValueMinusInfOneDNNOp( + TestFillConstant2DOneDNNOp): + def set_attrs(self): + self.str_value = "-inf" + self.attrs = { + 'shape': (10, 13), + 'use_mkldnn': True, + 'str_value': "-inf" + } + + +class TestFillZerosLike2DStringValueFloatOneDNNOp(TestFillConstant2DOneDNNOp): + def set_attrs(self): + self.str_value = "0.123" + self.attrs = { + 'shape': (10, 13), + 'use_mkldnn': True, + 'str_value': "0.123" + } + + +class TestFillZerosLike2DValueTensorPriorityOneDNNOp( + TestFillZerosLike2DStringValueFloatOneDNNOp): + def set_inputs(self): + self.inputs = {'ValueTensor': np.atleast_1d(2.25).astype("float32")} + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py new file mode 100644 index 0000000000000..f30a391f65385 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py @@ -0,0 +1,303 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
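
Note (not part of the patch): the MLU Adam tests below compute their expected outputs with adam_step imported from test_adam_op. For orientation, a NumPy sketch of the textbook Adam update those expectations follow; the exact epsilon scaling is defined by test_adam_op.adam_step, not by this illustration:

import numpy as np

def adam_reference(param, grad, m1, m2, lr, beta1, beta2, epsilon,
                   beta1_pow, beta2_pow):
    # First and second moment updates.
    m1_out = beta1 * m1 + (1.0 - beta1) * grad
    m2_out = beta2 * m2 + (1.0 - beta2) * np.square(grad)
    # Bias-corrected step size.
    lr_t = lr * np.sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow)
    param_out = param - lr_t * m1_out / (np.sqrt(m2_out) + epsilon)
    return param_out, m1_out, m2_out
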
+ +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from test_adam_op import adam_step + +paddle.enable_static() +SEED = 2022 + + +class TestAdam(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamWithEpsilonTensor(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + } + + self.attrs = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithSkipUpdate(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = 
np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithGlobalBetaPow(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + } + + attributes = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.attrs = {'use_global_beta_pow': True} + + # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty. 
+ self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([]), + 'Beta2PowOut': np.array([]) + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + adam = fluid.optimizer.Adam(learning_rate=0.01) + adam.minimize(loss) + + if run_mlu: + place = paddle.device.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + mlu_pred, mlu_loss = self._test(True) + cpu_pred, cpu_loss = self._test(False) + self.assertTrue(np.allclose(mlu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss, rtol=1e-3)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py new file mode 100644 index 0000000000000..d2827725a2058 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py @@ -0,0 +1,250 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
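
Note (not part of the patch): the adamw tests below pass "coeff" and "with_decay" and build expectations with adamw_step from test_adam_op. Conceptually, decoupled weight decay shrinks the parameter itself before the ordinary Adam moment update, instead of folding an L2 term into the gradient. A hedged sketch of that extra step (the authoritative reference remains adamw_step):

def apply_decoupled_weight_decay(param, lr, coeff, with_decay=True):
    # AdamW-style decay: scale the parameter directly; the regular Adam
    # update is then applied to the decayed parameter.
    if with_decay:
        return param * (1.0 - lr * coeff)
    return param
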
+ +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from test_adam_op import adamw_step + +paddle.enable_static() +SEED = 2022 + + +class TestAdamW(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (105, 102)).astype("float32") + grad = np.random.uniform(-1, 1, (105, 102)).astype("float32") + moment1 = np.random.uniform(-1, 1, (105, 102)).astype("float32") + # The second moment is positive + moment2 = np.random.random((105, 102)).astype("float32") + + learning_rate = 0.5 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "coeff": 0.9, + "with_decay": True + } + + param_out, moment1_out, \ + moment2_out = adamw_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithSkipUpdate(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon, "coeff": 0.02, "with_decay": True} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithoutDecay(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = 
np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon, "coeff": 0.02, "with_decay": False} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + adam = paddle.optimizer.AdamW(learning_rate=0.01, weight_decay=0.02) + adam.minimize(loss) + + if run_mlu: + place = paddle.device.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + mlu_pred, mlu_loss = self._test(True) + cpu_pred, cpu_loss = self._test(False) + self.assertTrue(np.allclose(mlu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss, rtol=1e-3)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py new file mode 100644 index 0000000000000..85302ad76da8b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py @@ -0,0 +1,52 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +class TestAssign(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "assign" + self.init_dtype() + + x = np.random.random([3, 3]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + self.outputs = {'Out': x} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py new file mode 100644 index 0000000000000..5ee9d369e0fd9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
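
Note (not part of the patch): each TestAssignValueMLUOp variant below flattens the value tensor into a dtype-specific attribute. A small illustrative helper that makes the mapping used by those test cases explicit:

import numpy as np

def values_attr_for(array):
    # Attribute names as set by the init_data variants below.
    mapping = {
        np.dtype("float32"): "fp32_values",
        np.dtype("int32"): "int32_values",
        np.dtype("int64"): "int64_values",
        np.dtype("bool"): "bool_values",
    }
    return mapping[array.dtype], [v.item() for v in array.flat]

name, flat_values = values_attr_for(np.zeros((2, 5), dtype="int64"))
assert name == "int64_values" and len(flat_values) == 10
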
+ +from __future__ import print_function + +import unittest +import numpy +import sys +sys.path.append("..") + +import op_test +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers + +paddle.enable_static() +numpy.random.seed(2022) + + +class TestAssignValueMLUOp(op_test.OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "assign_value" + self.inputs = {} + self.attrs = {} + self.init_data() + + self.attrs["shape"] = self.value.shape + self.attrs["dtype"] = framework.convert_np_dtype_to_dtype_( + self.value.dtype) + self.outputs = {"Out": self.value} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.float32) + self.attrs["fp32_values"] = [float(v) for v in self.value.flat] + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestAssignValueMLUOp2(TestAssignValueMLUOp): + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32) + self.attrs["int32_values"] = [int(v) for v in self.value.flat] + + +class TestAssignValueMLUOp3(TestAssignValueMLUOp): + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64) + self.attrs["int64_values"] = [int(v) for v in self.value.flat] + + +class TestAssignValueMLUOp4(TestAssignValueMLUOp): + def init_data(self): + self.value = numpy.random.choice( + a=[False, True], size=(2, 5)).astype(numpy.bool) + self.attrs["bool_values"] = [int(v) for v in self.value.flat] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py new file mode 100644 index 0000000000000..8b32692020cbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py @@ -0,0 +1,309 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
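
Note (not part of the patch): the layer_norm test below checks against _reference_layer_norm_naive imported from test_layer_norm_op. As a reading aid, a NumPy sketch of the standard layer normalization it encodes, with statistics taken over the dimensions from begin_norm_axis onward; output shapes and the exact variance reporting are defined by the imported reference, not by this sketch:

import numpy as np

def layer_norm_naive(x, scale, bias, epsilon, begin_norm_axis):
    # Flatten normalized dimensions so mean/variance are computed per row.
    n = int(np.prod(x.shape[:begin_norm_axis]))
    d = int(np.prod(x.shape[begin_norm_axis:]))
    x2 = x.reshape(n, d)
    mean = x2.mean(axis=1, keepdims=True)
    var = x2.var(axis=1, keepdims=True)
    y = (x2 - mean) / np.sqrt(var + epsilon)
    if scale is not None:
        y = y * scale.reshape(1, d)
    if bias is not None:
        y = y + bias.reshape(1, d)
    return y.reshape(x.shape), mean.reshape(n), var.reshape(n)
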
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from operator import mul +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.nn.functional as F +from functools import reduce +import sys +sys.path.append('..') +from op_test import _set_use_system_allocator +from paddle.fluid import Program, program_guard +from paddle.fluid.contrib.mixed_precision.fp16_utils import _keep_layer_norm_scale_bias_to_fp32 +from test_layer_norm_op import _reference_layer_norm_naive, _reference_layer_norm_grad + +paddle.enable_static() + +np.random.random(123) + +_set_use_system_allocator(True) + + +class TestLayerNormOp(unittest.TestCase): + def setUp(self): + self.use_cudnn = True + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def check_forward_backward(self, + shape, + begin_norm_axis, + has_scale=True, + has_bias=True, + y_grad_scale=1.0, + use_mkldnn=False): + def test_with_place(place, + shape, + begin_norm_axis, + use_mkldnn=use_mkldnn): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(np.float32) + scale = np.random.random_sample(scale_shape).astype( + np.float32) if has_scale else None + bias = np.random.random_sample(scale_shape).astype( + np.float32) if has_bias else None + y_grad = (np.random.random_sample(x_shape) * + y_grad_scale).astype(np.float32) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive( + x, scale, bias, epsilon, begin_norm_axis) + x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( + x, y_grad, scale, bias, mean, variance, begin_norm_axis) + + var_dict = locals() + var_dict['y@GRAD'] = y_grad + var_names = ['x', 'mean', 'variance', 'y', 'y@GRAD'] + if has_scale: + var_names += ['scale'] + if has_bias: + var_names += ['bias'] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + inputs = {"X": block.var('x')} + fetch_list = [ + 'y', + 'mean', + 'variance', + 'x@GRAD', + ] + if has_scale: + inputs["Scale"] = block.var('scale') + fetch_list += ['scale@GRAD'] + if has_bias: + inputs["Bias"] = block.var('bias') + fetch_list += ['bias@GRAD'] + layer_norm_op = block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var('y'), + "Mean": block.var('mean'), # share the same memory + "Variance": + block.var('variance'), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": use_mkldnn + }) + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + layer_norm_op.desc, set(), []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + program._sync_with_cpp() + 
exe = fluid.Executor(place) + out = exe.run(program, + feed={ + name: var_dict[name] + for name in ['x', 'scale', 'bias', 'y@GRAD'] + }, + fetch_list=fetch_list) + + self.__assert_close(y, out[0], "y") + self.__assert_close(mean, out[1], "mean") + self.__assert_close(1 / np.sqrt(variance), out[2], "variance", + 1e-3) + self.__assert_close(x_grad, out[3], "x_grad") + if has_scale: + self.__assert_close(scale_grad, + out[fetch_list.index('scale@GRAD')], + "scale_grad", 1e-3) + if has_bias: + self.__assert_close(bias_grad, + out[fetch_list.index('bias@GRAD')], + "bias_grad") + + test_with_place(self.place, shape, begin_norm_axis) + + def test_check_forward_backward_with_scale_and_bias(self): + self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward( + shape=[2, 3, 4, 5], + begin_norm_axis=1, + has_scale=False, + has_bias=True) + self.check_forward_backward( + shape=[2, 3, 4, 5], + begin_norm_axis=1, + has_scale=True, + has_bias=False) + self.check_forward_backward( + shape=[2, 3, 4, 5], + begin_norm_axis=1, + has_scale=False, + has_bias=False) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) + self.check_forward_backward( + shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1) + self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2) + self.check_forward_backward( + shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=True, + y_grad_scale=0.1) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=True, + has_bias=False, + y_grad_scale=0.1) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=False, + y_grad_scale=0.1) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True) + + +class TestLayerNormAPI(unittest.TestCase): + def test_case(self): + x = fluid.layers.data( + name='x', + shape=[64, 32, 256], + dtype='float32', + append_batch_size=False) + x = fluid.layers.layer_norm( + x, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None) + x = fluid.layers.layer_norm( + x, + scale=False, + shift=False, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None) + x = fluid.layers.layer_norm( + x, + scale=False, + shift=False, + begin_norm_axis=1, + epsilon=1e-05, + param_attr="scale", + bias_attr="shift") + + +class TestDygraphLayerNormAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + paddle.enable_static() + + layer_norm = fluid.LayerNorm([32, 32]) + # the input of LayerNorm must be Variable. 
+ x1 = np.random.random((3, 32, 32)).astype('float32') + self.assertRaises(TypeError, layer_norm, x1) + + # the input dtype of LayerNorm must be float32 or float16 + x2 = fluid.layers.data(name='x2', shape=[3, 32, 32], dtype="int32") + self.assertRaises(TypeError, layer_norm, x2) + + +class TestFP16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + weight_np = weight_np.astype(dtype) + bias_np = bias_np.astype(dtype) + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + y_np = y.numpy().astype('float32') + x_g_np = x_g.numpy().astype('float32') + w_g_np = w_g.numpy().astype('float16') + b_g_np = b_g.numpy().astype('float32') + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + x_np = np.random.random([10, 20]).astype('float16') + weight_np = np.random.random([20]).astype('float16') + bias_np = np.random.random([20]).astype('float16') + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, 'float16') + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, 'float32') + + def assert_equal(x, y): + self.assertTrue(np.array_equal(x, y)) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + +class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): + def test_main(self): + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(False) + self.assertFalse(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(True) + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py new file mode 100644 index 0000000000000..44532ddceb765 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py @@ -0,0 +1,631 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
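
Note (not part of the patch): every slice test below pairs (axes, starts, ends) attributes with the equivalent NumPy basic indexing used to build self.out. A small illustrative helper reproducing that mapping for the non-decrease cases; decrease_axis additionally squeezes the listed size-1 dimensions:

import numpy as np

def slice_like_op(x, axes, starts, ends):
    # Untouched axes keep slice(None); listed axes use the given start/end
    # (negative and out-of-range values clamp, as in the test configs).
    index = [slice(None)] * x.ndim
    for axis, start, end in zip(axes, starts, ends):
        index[axis] = slice(start, end)
    return x[tuple(index)]

x = np.random.random([3, 4, 5, 6]).astype("float32")
out = slice_like_op(x, axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4])
assert out.shape == (2, 3, 2, 6)  # same as x[1:3, 0:3, 2:4, :]
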
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +import sys +sys.path.append('..') +from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle + +paddle.enable_static() + + +# Situation 1: starts(list, no tensor), ends(list, no tensor) +# 1.1 without attr(decrease) +class TestSliceOp(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + +class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, :, 2:-1] + + +# 1.2 with attr(decrease) +class TestSliceOp_decs_dim(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + +class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + +class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype("float32") + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + 
self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + +class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +# Situation 2: starts(list, have tensor), ends(list, no tensor) +# without attr(decrease) +class TestSliceOp_starts_ListTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + + starts_tensor = [] + for index, ele in enumerate(self.starts): + starts_tensor.append(("x" + str(index), np.ones( + (1)).astype('int64') * ele)) + + self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts_infer, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [-1, 1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + self.starts_infer = [-1, 0, -1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +# Situation 2: starts(list, have tensor), ends(list, no tensor) +# with attr(decrease) +class TestSliceOp_decs_dim_starts_ListTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + + starts_tensor = [] + for index, ele in enumerate(self.starts): + starts_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} + + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts_infer, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, -1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + self.starts_infer = [1, -1, 2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +class TestSliceOp_decs_dim_5_starts_ListTensor( + TestSliceOp_decs_dim_starts_ListTensor): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [-1] + self.out = self.input[:, :, :, -1] + + self.starts_infer = [-1] + + +# Situation 3: starts(tensor), ends(list, no 
tensor) +# with attr(decrease) +class TestSliceOp_decs_dim_starts_OneTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int32") + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Situation 4: starts(tensor), ends(tensor) +# without attr(decrease) +class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int64"), + "EndsTensor": np.array( + self.ends, dtype="int32") + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + #'ends': self.ends_infer, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Situation 5: starts(tensor), ends(tensor) +# with attr(decrease) +class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int32"), + "EndsTensor": np.array( + self.ends, dtype="int32") + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + #'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Situation 6: starts(tensor), ends(list, have tensor) +# without attr(decrease) +class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + + ends_tensor = [] + for index, ele in enumerate(self.ends): + ends_tensor.append(("y" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'Input': self.input, + 
"StartsTensor": np.array( + self.starts, dtype="int32"), + 'EndsTensorList': ends_tensor + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + 'ends': self.ends_infer, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + self.ends_infer = [-1, 3, 4] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Test float16 +class TestFP16(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.dtype = "float16" + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.out = self.input[-3:3, 0:100, :, 2:-1] + self.infer_flags = [1, 1, 1] + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +class TestFP16_2(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.dtype = "float16" + self.input = np.random.random([3, 4, 10]).astype(self.dtype) + self.starts = [0] + self.ends = [1] + self.axes = [1] + self.out = self.input[:, 0:1, :] + self.infer_flags = [1] + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + max_relative_error=0.006, + numeric_grad_delta=0.5) + + +class TestSliceApiWithTensor(unittest.TestCase): + def test_starts_ends_is_tensor(self): + with paddle.fluid.dygraph.guard(): + a = paddle.rand(shape=[4, 5, 6], dtype='float32') + axes = [0, 1, 2] + starts = [-3, 0, 2] + ends = [3, 2, 4] + a_1 = paddle.slice( + a, + axes=axes, + starts=paddle.to_tensor( + starts, dtype='int32'), + ends=paddle.to_tensor( + ends, dtype='int32')) + a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends) + + self.assertTrue(np.array_equal(a_1.numpy(), a_2.numpy())) + + def test_bool_tensor(self): + with paddle.fluid.dygraph.guard(): + array = (np.arange(60).reshape([3, 4, 5]) % 3).astype('bool') + tt = paddle.to_tensor(array) + tt.stop_gradient = False + + starts = [0, 1, 2] + ends = [3, 5, 4] + axes = [0, 1, 2] + + y_paddle = paddle.slice(tt, axes, starts, ends) + y_np = tt[0:3, 1:5, 2:4] + + self.assertTrue(paddle.bool == y_paddle.dtype) + self.assertTrue(np.array_equal(y_paddle.numpy(), y_np)) + + +class TestImperativeVarBaseGetItem(unittest.TestCase): + def test_getitem_with_long(self): + with fluid.dygraph.guard(): + data = np.random.random((2, 80, 16128)).astype('float32') + var = 
fluid.dygraph.to_variable(data) + sliced = var[:, 10:, :var.shape[1]] # var.shape[1] is 80L here + self.assertEqual(sliced.shape, [2, 70, 80]) + + sliced = var[:, var.shape[0]:, var.shape[0]:var.shape[1]] + self.assertEqual(sliced.shape, [2, 78, 78]) + + def test_getitem_with_float(self): + def test_float_in_slice_item(): + with fluid.dygraph.guard(): + data = np.random.random((2, 80, 16128)).astype('float32') + var = fluid.dygraph.to_variable(data) + sliced = var[:, 1.1:, :var.shape[1]] + + self.assertRaises(Exception, test_float_in_slice_item) + + def test_float_in_index(): + with fluid.dygraph.guard(): + data = np.random.random((2, 80, 16128)).astype('float32') + var = fluid.dygraph.to_variable(data) + sliced = var[1.1] + + self.assertRaises(Exception, test_float_in_index) + + +class TestInferShape(unittest.TestCase): + def test(self): + x = paddle.ones(shape=[3, 4, 5]) + x.desc.set_shape([3, -1, 5]) + self.assertEqual(x.shape, (3, -1, 5)) + + out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3]) + self.assertEqual(out0.shape, (3, 3, 5)) + + def test_axis_less_than_zero(self): + + # Using paddle.disable_static will make other unittests fail. + with fluid.dygraph.guard(): + x_arr = np.arange(0, 24, dtype=np.float32).reshape([2, 3, 4]) + x = paddle.to_tensor(x_arr) + + pp_slice = paddle.slice(x, [100, ], [0], [1]) + np_slice = x_arr[:, :, 0:1] + self.assertTrue(np.array_equal(pp_slice, np_slice)) + + pp_slice = paddle.slice(x, (-100, ), [0], [1]) + np_slice = x_arr[0:1] + self.assertTrue(np.array_equal(pp_slice, np_slice)) + + x_arr = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0))) + + starts = paddle.to_tensor( + np.reshape( + np.array( + [], dtype=np.int32), (0, ))) + ends = paddle.to_tensor( + np.reshape( + np.array( + [], dtype=np.int32), (0, ))) + + with self.assertRaises(ValueError): + paddle.slice(x, [-1000000], starts, ends) + + with self.assertRaises(ValueError): + paddle.slice(x, [1000000], starts, ends) + + with self.assertRaises(ValueError): + paddle.slice(x, [], starts, ends) + + with self.assertRaises(ValueError): + paddle.slice(x, 0, starts, ends) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py new file mode 100644 index 0000000000000..28833a7dc1dcc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py @@ -0,0 +1,235 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
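
The NPU multinomial test added below verifies sampling statistics rather than exact values: with replacement=True and a large number of draws, the empirical category frequencies should approach the normalized input probabilities within the test's atol=0.01. A minimal numpy-only sketch of that idea (illustrative only, not part of the patch; np.random.choice stands in for paddle.multinomial here):

import numpy as np

probs = np.random.rand(4)
probs /= probs.sum()                                   # normalize, as verify_output does
samples = np.random.choice(4, size=100000, p=probs)    # stand-in for multinomial sampling with replacement
freq = np.bincount(samples, minlength=4) / samples.size
assert np.allclose(freq, probs, rtol=0, atol=0.01)     # same tolerance the test uses
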
+ +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +import sys +sys.path.append("..") +from op_test import OpTest +import numpy as np +import os + +paddle.enable_static() + + +def sample_output_one_dimension(out, dim): + # count numbers of different categories + sample_prob = np.zeros(dim).astype("float32") + sample_index_prob = np.unique(out, return_counts=True) + sample_prob[sample_index_prob[0]] = sample_index_prob[1] + sample_prob /= sample_prob.sum() + return sample_prob + + +def sample_output_two_dimension(out, shape): + num_dist = shape[0] + out_list = np.split(out, num_dist, axis=0) + sample_prob = np.zeros(shape).astype("float32") + for i in range(num_dist): + sample_index_prob = np.unique(out_list[i], return_counts=True) + sample_prob[i][sample_index_prob[0]] = sample_index_prob[1] + sample_prob /= sample_prob.sum(axis=-1, keepdims=True) + return sample_prob + + +class TestMultinomialOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "multinomial" + self.init_data() + self.inputs = {"X": self.input_np} + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_data(self): + # input probability is a vector, and replacement is True + self.input_np = np.random.rand(4) + self.outputs = {"Out": np.zeros(100000).astype("int64")} + self.attrs = {"num_samples": 100000, "replacement": True} + + def test_check_output(self): + self.check_output_customized( + self.verify_output, custom_place=self.place) + + def sample_output(self, out): + return sample_output_one_dimension(out, 4) + + def verify_output(self, outs): + # normalize the input to get the probability + prob = self.input_np / self.input_np.sum(axis=-1, keepdims=True) + sample_prob = self.sample_output(np.array(outs[0])) + self.assertTrue( + np.allclose( + sample_prob, prob, rtol=0, atol=0.01), + "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + + +class TestMultinomialOp2(TestMultinomialOp): + def init_data(self): + # input probability is a matrix + self.input_np = np.random.rand(3, 4) + self.outputs = {"Out": np.zeros((3, 100000)).astype("int64")} + self.attrs = {"num_samples": 100000, "replacement": True} + + def sample_output(self, out): + return sample_output_two_dimension(out, [3, 4]) + + +class TestMultinomialOp3(TestMultinomialOp): + def init_data(self): + # replacement is False. number of samples must be less than number of categories. + self.input_np = np.random.rand(1000) + self.outputs = {"Out": np.zeros(100).astype("int64")} + self.attrs = {"num_samples": 100, "replacement": False} + + def verify_output(self, outs): + out = np.array(outs[0]) + unique_out = np.unique(out) + self.assertEqual( + len(unique_out), 100, + "replacement is False. 
categories can't be sampled repeatedly") + + +class TestMultinomialApi(unittest.TestCase): + def test_dygraph(self): + # input probability is a vector, and replacement is True + paddle.set_device('npu:0') + paddle.disable_static() + x_numpy = np.random.rand(4) + x = paddle.to_tensor(x_numpy) + out = paddle.multinomial(x, num_samples=100000, replacement=True) + + sample_prob = sample_output_one_dimension(out.numpy(), 4) + prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True) + self.assertTrue( + np.allclose( + sample_prob, prob, rtol=0, atol=0.01), + "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + paddle.enable_static() + + def test_dygraph2(self): + # input probability is a matrix, and replacement is True + paddle.set_device('npu:0') + paddle.disable_static() + x_numpy = np.random.rand(3, 4) + x = paddle.to_tensor(x_numpy) + out = paddle.multinomial(x, num_samples=100000, replacement=True) + + sample_prob = sample_output_two_dimension(out.numpy(), [3, 4]) + prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True) + self.assertTrue( + np.allclose( + sample_prob, prob, rtol=0, atol=0.01), + "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + paddle.enable_static() + + def test_dygraph3(self): + # replacement is False. number of samples must be less than number of categories. + paddle.set_device('npu:0') + paddle.disable_static() + x_numpy = np.random.rand(1000) + x = paddle.to_tensor(x_numpy) + out = paddle.multinomial(x, num_samples=100, replacement=False) + + unique_out = np.unique(out.numpy()) + self.assertEqual( + len(unique_out), 100, + "replacement is False. categories can't be sampled repeatedly") + paddle.enable_static() + + def test_dygraph4(self): + paddle.set_device('npu:0') + paddle.disable_static() + logits = -1 * paddle.ones([2800]) + # Categorical.sample API will call multinomial op with replacement=True + cat = paddle.distribution.Categorical(logits.exp()) + cat.sample([1]) + paddle.enable_static() + + def test_static(self): + paddle.set_device('npu:0') + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + x = fluid.data('x', shape=[4], dtype='float32') + out = paddle.multinomial(x, num_samples=100000, replacement=True) + + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + + exe.run(startup_program) + x_np = np.random.rand(4).astype('float32') + out = exe.run(train_program, feed={'x': x_np}, fetch_list=[out]) + + sample_prob = sample_output_one_dimension(out, 4) + prob = x_np / x_np.sum(axis=-1, keepdims=True) + self.assertTrue( + np.allclose( + sample_prob, prob, rtol=0, atol=0.01), + "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + + +class TestMultinomialAlias(unittest.TestCase): + def test_alias(self): + paddle.set_device('npu:0') + x = paddle.rand([4]) + out1 = paddle.multinomial(x, num_samples=10, replacement=True) + out2 = paddle.tensor.multinomial(x, num_samples=10, replacement=True) + out3 = paddle.tensor.random.multinomial( + x, num_samples=10, replacement=True) + + +class TestMultinomialError(unittest.TestCase): + def setUp(self): + paddle.set_device('npu:0') + paddle.disable_static() + + def tearDown(self): + paddle.enable_static() + + def test_num_sample(self): + def test_num_sample_less_than_0(): + x = paddle.rand([4]) + out = paddle.multinomial(x, num_samples=-2) + + self.assertRaises(ValueError, test_num_sample_less_than_0) + + def test_input_probs_dim(self): + def test_dim_larger_than_2(): + x = paddle.rand([2, 3, 3]) + out = 
paddle.multinomial(x) + + self.assertRaises(ValueError, test_dim_larger_than_2) + + def test_dim_less_than_1(): + x_np = np.random.random([]) + x = paddle.to_tensor(x_np) + out = paddle.multinomial(x) + + self.assertRaises(ValueError, test_dim_less_than_1) + + with self.assertRaises(ValueError): + prob = paddle.rand([20, 1000]) + prob[1:0] = 0 + out = paddle.multinomial(prob) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 13c72bedefa8e..f7a3dfa1102b2 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -872,7 +872,7 @@ def cal_python_api(python_api, args, kernel_sig): eager_tensor_outputs = egr_oups if egr_oups else self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) - # prepare attrbutes + # prepare attributes attrs_outputs = {} if hasattr(self, "attrs"): for attrs_name in self.attrs: @@ -906,7 +906,7 @@ def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) - # prepare attrbutes + # prepare attributes attrs_outputs = {} if hasattr(self, "attrs"): for attrs_name in self.attrs: @@ -2016,7 +2016,7 @@ def _get_dygraph_grad(self, outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) - # prepare attrbutes + # prepare attributes attrs_outputs = {} if hasattr(self, "attrs"): for attrs_name in self.attrs: diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py index cc40b89b585cb..bf33adcf48655 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py +++ b/python/paddle/fluid/tests/unittests/seresnext_test_base.py @@ -25,7 +25,7 @@ def _compare_result_with_origin_model(self, check_func, use_device, delta2=1e-5, - compare_seperately=True): + compare_separately=True): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return @@ -45,7 +45,7 @@ def _compare_result_with_origin_model(self, batch_size=seresnext_net.batch_size(use_device), use_device=use_device) - if compare_seperately: + if compare_separately: for loss in zip(func_1_first_loss, func_2_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(func_1_last_loss, func_2_last_loss): diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py index b57f26776234e..4dc3fe6eab6be 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py @@ -69,9 +69,9 @@ def __init__(self, super(ParallelFusedMultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py index 5f467da6a6465..ad570fc0acfb3 100644 --- 
a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py @@ -172,10 +172,10 @@ def __init__(self, name=None): super(ParallelFusedFeedForward, self).__init__() assert d_model > 0, ( - "Expected d_model to be greater than 0, but recieved {}".format( + "Expected d_model to be greater than 0, but received {}".format( d_model)) assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but recieved {}". + "Expected dim_feedforward to be greater than 0, but received {}". format(dim_feedforward)) self._dtype = self._helper.get_default_dtype() diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index d2eef785f6e07..3e2f112e964bb 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -54,8 +54,8 @@ def adamw_step(inputs, attributes): moment1_out = beta1 * moment1 + (1 - beta1) * grad moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) - lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) - param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + denom = (np.sqrt(moment2_out) / np.sqrt(1.0 - beta2_pow)) + epsilon + param_out = param + ((moment1_out / denom) * (-(lr / (1.0 - beta1_pow)))) return param_out, moment1_out, moment2_out @@ -314,16 +314,16 @@ def simple_lr_setting(param, decay_rate, n_layers): "core is not compiled with CUDA") class TestAdamWOpLayerwiseLR(TestAdamWOp): def setUp(self): - random.seed(2021) - np.random.seed(2021) - paddle.seed(2021) + random.seed(2022) + np.random.seed(2022) + paddle.seed(2022) def test_adamw_op_dygraph(self): paddle.disable_static() - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) - linear1 = paddle.nn.Linear(13, 8) - linear2 = paddle.nn.Linear(8, 5) + linear1 = paddle.nn.Linear( + 13, 8, bias_attr=paddle.nn.initializer.Constant(value=1.0)) + linear2 = paddle.nn.Linear( + 8, 5, bias_attr=paddle.nn.initializer.Constant(value=1.0)) # fix the linear name, simple_lr_setting function will use the name linear1.weight.name = "linear_1.w_0" @@ -331,33 +331,103 @@ def test_adamw_op_dygraph(self): linear2.weight.name = "linear_2.w_0" linear2.bias.name = "linear_2.b_0" + fc1_w = np.array(linear1.weight) + fc1_w_mon1 = np.zeros_like(fc1_w) + fc1_w_mon2 = np.zeros_like(fc1_w) + fc1_b = np.array(linear1.bias) + fc1_b_mon1 = np.zeros_like(fc1_b) + fc1_b_mon2 = np.zeros_like(fc1_b) + + fc2_w = np.array(linear2.weight) + fc2_w_mon1 = np.zeros_like(fc2_w) + fc2_w_mon2 = np.zeros_like(fc2_w) + fc2_b = np.array(linear2.bias) + fc2_b_mon1 = np.zeros_like(fc2_b) + fc2_b_mon2 = np.zeros_like(fc2_b) + simple_lr_fun = partial(simple_lr_setting, decay_rate=0.8, n_layers=2) + learning_rate = 0.001 + weight_decay = 0.01 + beta1 = 0.9 + beta2 = 0.999 - adam = paddle.optimizer.AdamW( - learning_rate=0.01, + opt = paddle.optimizer.AdamW( + learning_rate=learning_rate, parameters=[{ 'params': linear1.parameters() }, { 'params': linear2.parameters(), }], apply_decay_param_fun=lambda name: True, - weight_decay=0.01, + weight_decay=weight_decay, lr_ratio=simple_lr_fun) - loss_ref = np.array( - [-1.7267396, -2.81524, -3.9250019, -5.05954, -6.2272625]) + def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t): + np_inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': 
np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1**t]).astype("float32"), + 'Beta2Pow': np.array([beta2**t]).astype("float32") + } + + np_attrs = { + 'epsilon': 1e-8, + 'beta1': beta1, + 'beta2': beta2, + "lr_ratio": lr_ratio, + "coeff": weight_decay, + "with_decay": True + } + param_out, moment1_out, moment2_out = adamw_step(np_inputs, + np_attrs) + return param_out, moment1_out, moment2_out + for i in range(5): + a = paddle.to_tensor( + np.random.uniform(-1, 1, (2, 13)).astype("float32")) a1 = linear1(a) out = linear2(a1) out = paddle.mean(out) out.backward() - adam.step() - adam.clear_gradients() - np.testing.assert_allclose(out[0].numpy(), loss_ref[i], rtol=1e-6) + + fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output( + fc1_w, + np.array(linear1.weight.grad), fc1_w_mon1, fc1_w_mon2, + simple_lr_fun(linear1.weight), i + 1) + fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output( + fc1_b, + np.array(linear1.bias.grad), fc1_b_mon1, fc1_b_mon2, + simple_lr_fun(linear1.bias), i + 1) + fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output( + fc2_w, + np.array(linear2.weight.grad), fc2_w_mon1, fc2_w_mon2, + simple_lr_fun(linear2.weight), i + 1) + fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output( + fc2_b, + np.array(linear2.bias.grad), fc2_b_mon1, fc2_b_mon2, + simple_lr_fun(linear2.bias), i + 1) + + opt.step() + opt.clear_gradients() + + np.testing.assert_allclose(linear1.weight.numpy(), fc1_w, rtol=1e-6) + np.testing.assert_allclose(linear1.bias.numpy(), fc1_b, rtol=1e-6) + np.testing.assert_allclose(linear2.weight.numpy(), fc2_w, rtol=1e-6) + np.testing.assert_allclose(linear2.bias.numpy(), fc2_b, rtol=1e-6) def test_adamw_op(self): paddle.enable_static() place = fluid.CUDAPlace(0) + + learning_rate = 0.0001 + beta1 = 0.85 + beta2 = 0.95 + weight_decay = 0.01 + epsilon = 1e-8 + train_prog = fluid.Program() startup = fluid.Program() with fluid.program_guard(train_prog, startup): @@ -365,42 +435,121 @@ def test_adamw_op(self): x = fluid.data(name='x', shape=[None, 10], dtype='float32') y = fluid.data(name='y', shape=[None, 1], dtype='float32') - fc1 = fluid.layers.fc(input=x, size=32, act=None) - prediction = fluid.layers.fc(input=fc1, size=1, act=None) - cost = fluid.layers.square_error_cost(input=prediction, label=y) + weight_attr1 = paddle.framework.ParamAttr(name="linear_0.w_0") + bias_attr1 = paddle.framework.ParamAttr( + name="linear_0.b_0", + initializer=paddle.nn.initializer.Constant(value=1.0)) + weight_attr2 = paddle.framework.ParamAttr(name="linear_1.w_0") + bias_attr2 = paddle.framework.ParamAttr( + name="linear_1.b_0", + initializer=paddle.nn.initializer.Constant(value=1.0)) + linear1 = paddle.nn.Linear( + 10, 32, weight_attr=weight_attr1, bias_attr=bias_attr1) + linear2 = paddle.nn.Linear( + 32, 1, weight_attr=weight_attr2, bias_attr=bias_attr2) + + out = linear1(x) + out = linear2(out) + + fc1_w_mon1 = np.zeros((linear1.weight.shape)).astype("float32") + fc1_w_mon2 = np.zeros((linear1.weight.shape)).astype("float32") + fc1_b_mon1 = np.zeros((linear1.bias.shape)).astype("float32") + fc1_b_mon2 = np.zeros((linear1.bias.shape)).astype("float32") + fc2_w_mon1 = np.zeros((linear2.weight.shape)).astype("float32") + fc2_w_mon2 = np.zeros((linear2.weight.shape)).astype("float32") + fc2_b_mon1 = np.zeros((linear2.bias.shape)).astype("float32") + fc2_b_mon2 = np.zeros((linear2.bias.shape)).astype("float32") + + cost = fluid.layers.square_error_cost(input=out, label=y) avg_cost = fluid.layers.mean(cost) simple_lr_fun = partial( simple_lr_setting, decay_rate=0.8, n_layers=2) - 
beta1 = fluid.layers.create_global_var( - shape=[1], value=0.85, dtype='float32', persistable=True) - beta2 = fluid.layers.create_global_var( - shape=[1], value=0.95, dtype='float32', persistable=True) - betas = [beta1, beta2] opt = paddle.optimizer.AdamW( - learning_rate=1e-5, + learning_rate=learning_rate, beta1=beta1, beta2=beta2, - weight_decay=0.01, - epsilon=1e-8, + weight_decay=weight_decay, + epsilon=epsilon, lr_ratio=simple_lr_fun) opt.minimize(avg_cost) + def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t): + np_inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1**t]).astype("float32"), + 'Beta2Pow': np.array([beta2**t]).astype("float32") + } + + np_attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "lr_ratio": lr_ratio, + "coeff": weight_decay, + "with_decay": True + } + param_out, moment1_out, moment2_out = adamw_step(np_inputs, + np_attrs) + return param_out, moment1_out, moment2_out + + fetch_list1 = [ + "linear_0.w_0", "linear_0.b_0", "linear_1.w_0", "linear_1.b_0" + ] + fetch_list2 = [ + "linear_0.w_0", "linear_0.w_0@GRAD", "linear_0.b_0", + "linear_0.b_0@GRAD", "linear_1.w_0", "linear_1.w_0@GRAD", + "linear_1.b_0", "linear_1.b_0@GRAD" + ] + exe = fluid.Executor(place) exe.run(startup) + test_prog = train_prog.clone(for_test=True) - loss_ref = np.array( - [0.33895183, 0.3159437, 0.19472016, 0.17764759, 0.1520702]) for i in range(5): inputs = np.random.random(size=[8, 10]).astype('float32') outputs = np.random.random(size=[8, 1]).astype('float32') - rets = exe.run(train_prog, - feed={"x": inputs, - "y": outputs}, - fetch_list=[avg_cost]) - assert rets[0] is not None - np.testing.assert_allclose(rets[0], loss_ref[i], rtol=1e-6) + + param = exe.run(test_prog, + feed={"x": inputs, + "y": outputs}, + fetch_list=fetch_list1) + params_and_gras = exe.run(train_prog, + feed={"x": inputs, + "y": outputs}, + fetch_list=fetch_list2) + + fc1_w = param[0] + fc1_w_grad = params_and_gras[1] + fc1_b = param[1] + fc1_b_grad = params_and_gras[3] + fc2_w = param[2] + fc2_w_grad = params_and_gras[5] + fc2_b = param[3] + fc2_b_grad = params_and_gras[7] + + fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output( + fc1_w, fc1_w_grad, fc1_w_mon1, fc1_w_mon2, + simple_lr_fun(linear1.weight), i + 1) + fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output( + fc1_b, fc1_b_grad, fc1_b_mon1, fc1_b_mon2, + simple_lr_fun(linear1.bias), i + 1) + fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output( + fc2_w, fc2_w_grad, fc2_w_mon1, fc2_w_mon2, + simple_lr_fun(linear2.weight), i + 1) + fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output( + fc2_b, fc2_b_grad, fc2_b_mon1, fc2_b_mon2, + simple_lr_fun(linear2.bias), i + 1) + + np.testing.assert_allclose(params_and_gras[0], fc1_w, rtol=1e-6) + np.testing.assert_allclose(params_and_gras[2], fc1_b, rtol=1e-6) + np.testing.assert_allclose(params_and_gras[4], fc2_w, rtol=1e-6) + np.testing.assert_allclose(params_and_gras[6], fc2_b, rtol=1e-6) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py index a4ef15b1f0db3..b9e9224b9e402 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py @@ -27,6 +27,9 @@ class TestConvTransposeDoubleGradCheck(unittest.TestCase): + def conv_transpose_wrapper(self, x): + return 
paddle.nn.functional.conv2d_transpose(x[0], x[1], groups=1) + @prog_scope() def func(self, place): shape = [2, 4, 3, 3] @@ -55,6 +58,11 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) def test_grad(self): places = [] @@ -67,6 +75,10 @@ def test_grad(self): class TestConvTranspose2DoubleGradCheck_AsyPadding( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding=[1, 0, 0, 1]) + @prog_scope() def func(self, place): shape = [2, 2, 3, 3] @@ -100,10 +112,19 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) class TestConvTranspose2DoubleGradCheck_PaddingSAME( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding="SAME") + @prog_scope() def func(self, place): shape = [2, 2, 3, 3] @@ -137,10 +158,19 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) class TestConvTranspose2DoubleGradCheck_PaddingVALID( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding="VALID") + @prog_scope() def func(self, place): shape = [2, 2, 3, 3] @@ -174,10 +204,19 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) class TestConvTranspose2DoubleGradCheck_ChannelLast( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding=[1, 1], data_format="NHWC") + @prog_scope() def func(self, place): shape = [2, 3, 3, 2] @@ -213,6 +252,11 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 348945b73e1a4..5ef5a1016cc8b 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -24,6 +24,7 @@ import numpy as np import os import shutil +import tempfile import unittest @@ -82,12 +83,17 @@ def test_run_with_dump(self): """ Testcase for InMemoryDataset from create to run. 
""" - with open("test_run_with_dump_a.txt", "w") as f: + + temp_dir = tempfile.TemporaryDirectory() + dump_a_path = os.path.join(temp_dir.name, 'test_run_with_dump_a.txt') + dump_b_path = os.path.join(temp_dir.name, 'test_run_with_dump_b.txt') + + with open(dump_a_path, "w") as f: data = "1 a 1 a 1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 b 1 b 1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 c 1 c 1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_run_with_dump_b.txt", "w") as f: + with open(dump_b_path, "w") as f: data = "1 d 1 d 1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 e 1 e 1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 f 1 f 1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -110,8 +116,7 @@ def test_run_with_dump(self): parse_content=True, fea_eval=True, candidate_size=10000) - dataset.set_filelist( - ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"]) + dataset.set_filelist([dump_a_path, dump_b_path]) dataset.load_into_memory() dataset.local_shuffle() @@ -129,8 +134,7 @@ def test_run_with_dump(self): except Exception as e: self.assertTrue(False) - os.remove("./test_run_with_dump_a.txt") - os.remove("./test_run_with_dump_b.txt") + temp_dir.cleanup() def test_dataset_config(self): """ Testcase for dataset configuration. """ diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py index 5911ada1817b6..911bee69e8b77 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -25,6 +25,7 @@ import math import os import shutil +import tempfile import unittest import paddle.fluid.incubate.data_generator as dg @@ -282,7 +283,11 @@ def test_var_consistency_insepection(self): """ Testcase for InMemoryDataset of consistency insepection of use_var_list and data_generator. 
""" - with open("test_run_with_dump_a.txt", "w") as f: + + temp_dir = tempfile.TemporaryDirectory() + dump_a_path = os.path.join(temp_dir.name, 'test_run_with_dump_a.txt') + + with open(dump_a_path, "w") as f: # data = "\n" # data += "\n" data = "2 1;1 9;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;0;40000001;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000200;10000200;10063938;10000008;10000177;20002001 20001240 20001860 20003611 20010833 20000210 20000500 20000401 20000251 20012198 20001023 20000157;20002001 20001240 20001860 20003611 20012396 20000500 20002513 20012198 20001023 20000157;10000123;30000004;0.623 0.233 0.290 0.208 0.354 49.000 0.000 0.000 0.000 -1.000 0.569 0.679 0.733 53 17 2 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000047;30000004;0.067 0.000 0.161 0.005 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.378 0.043 0 6 0 0;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;10000200;30000001;0.407 0.111 0.196 0.095 0.181 49.000 0.000 0.000 0.000 -1.000 0.306 0.538 0.355 48 8 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;10000200;30000001;0.226 0.029 0.149 0.031 0.074 49.000 0.000 0.000 0.000 -1.000 0.220 0.531 0.286 26 6 0 0;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;10063938;30000001;0.250 0.019 0.138 0.012 0.027 49.000 0.000 0.000 0.000 -1.000 0.370 0.449 0.327 7 2 0 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000003;30000002;0.056 0.000 0.139 0.003 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.346 0.059 15 3 0 0;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;10000008;30000001;0.166 0.004 0.127 0.001 0.004 49.000 0.000 0.000 0.000 -1.000 0.103 0.417 0.394 10 3 0 0;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000177;30000001;0.094 0.008 0.157 0.012 0.059 49.000 0.000 0.000 0.000 -1.000 0.051 0.382 0.142 21 0 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20000157;20002001 20001240 20001860 20003611 20000157 20001776 20000070 
20000157;10000134;30000001;0.220 0.016 0.181 0.037 0.098 49.000 0.000 0.000 0.000 -1.000 0.192 0.453 0.199 17 1 0 0;20002001 20001240 20001860 20003611 20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000638;30000001;0.000 0.000 0.000 0.000 0.000 49.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0 0 0 0;\n" @@ -348,7 +353,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=0) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) print("case 1: check passed!") except Exception as e: print("warning: catch expected error") @@ -360,7 +365,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=2) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 2 catch expected error") print(e) @@ -371,7 +376,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=3) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 3 catch expected error") print(e) @@ -382,7 +387,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=4) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 4 catch expected error") print(e) @@ -393,13 +398,13 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=5) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 5 catch expected error") print(e) print("========================================") - os.remove("./test_run_with_dump_a.txt") + temp_dir.cleanup() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 11972059c832c..4f21b3220a9d3 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -1461,6 +1461,7 @@ def check_with_place(self, need_envs={}, log_name=""): if self._dygraph and (self._gloo_mode or self._nccl2_mode): + need_envs.update({"FLAGS_enable_eager_mode": "1"}) with _test_eager_guard(): self.check_with_place_func( model_file=model_file, @@ -1468,6 +1469,7 @@ def check_with_place(self, check_error_log=check_error_log, need_envs=need_envs, log_name=log_name) + need_envs.update({"FLAGS_enable_eager_mode": "0"}) self.check_with_place_func( model_file=model_file, delta=delta, diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py index e664face0483a..0a51045dee5e1 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -24,9 +25,10 @@ class 
TestDygraphGroupSharded(TestMultipleGpus): # check group sharded logic as well as the accuracy with single mode def test_dygraph_group_sharded(self): - self.run_mnist_2gpu('dygraph_group_sharded_api.py') + self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False) self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py index 0be455591bf93..50e1985138610 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py @@ -14,7 +14,6 @@ from __future__ import print_function -import os import unittest import paddle.fluid as fluid @@ -30,5 +29,4 @@ def test_dygraph_sharding_optimizer_stage2(self): if __name__ == "__main__": - os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index 63acaf6396913..c58d46edde753 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -464,5 +464,19 @@ def test_static_graph(self): self.check_output_equal(a, e) +class TestStaticGraphShape(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + def tearDown(self): + paddle.disable_static() + + def test_shape(self): + A = paddle.static.data(name='x', shape=[-1]) + B = paddle.static.data(name='y', shape=[384]) + C = paddle.einsum('i,d->id', A, B) + self.assertEqual(C.shape, (-1, 384)) + + if __name__ == "__main__": - u + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py new file mode 100644 index 0000000000000..8a8e74e28ec72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py @@ -0,0 +1,169 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
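
The elementwise heaviside tests added below use np.heaviside as the reference implementation: it returns 0 where x < 0, the second argument where x == 0, and 1 where x > 0, broadcasting the two inputs elementwise; paddle.heaviside is checked against it. A quick illustrative check (not part of the test file):

import numpy as np

x = np.array([-1.5, 0.0, 2.0])
y = np.array([0.5, 0.5, 0.5])
print(np.heaviside(x, y))   # [0.  0.5 1. ]
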
+ +import unittest +import numpy as np +from op_test import OpTest +import paddle + + +class TestElementwiseOp(OpTest): + def setUp(self): + self.op_type = "elementwise_heaviside" + x = np.random.random((13, 17)).astype("float64") + y = np.random.random((13, 17)).astype("float64") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.heaviside(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + +class TestHeavisideBroadcast(unittest.TestCase): + def setUp(self): + self.input_1 = np.random.rand(2, 100, 13, 17).astype("float32") + self.input_2 = np.random.rand(100, 13, 17).astype("float32") + self.input_3 = np.random.rand(100, 13, 1).astype("float32") + self.input_4 = np.random.rand(13, 17).astype("float32") + self.input_5 = np.random.rand(1).astype("float32") + + self.np_expected1 = np.heaviside(self.input_1, self.input_2) + self.np_expected2 = np.heaviside(self.input_2, self.input_3) + self.np_expected3 = np.heaviside(self.input_2, self.input_4) + self.np_expected4 = np.heaviside(self.input_4, self.input_5) + + def test_broadcast(self): + paddle.disable_static() + self.tensor_1 = paddle.to_tensor(self.input_1) + self.tensor_2 = paddle.to_tensor(self.input_2) + self.tensor_3 = paddle.to_tensor(self.input_3) + self.tensor_4 = paddle.to_tensor(self.input_4) + self.tensor_5 = paddle.to_tensor(self.input_5) + + res = paddle.heaviside(self.tensor_1, self.tensor_2) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected1)) + + res = paddle.heaviside(self.tensor_2, self.tensor_3) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected2)) + + res = paddle.heaviside(self.tensor_2, self.tensor_4) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected3)) + + res = paddle.heaviside(self.tensor_4, self.tensor_5) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected4)) + + +class TestHeavisideAPI_float64(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("float64") + self.y_np = np.random.random((13, 17)).astype("float64") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "float64" + + def test_static(self): + for use_cuda in ([False, True] + if paddle.device.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.enable_static() + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.static.data( + name=f"x_{self.dtype}", shape=[13, 17], dtype=self.dtype) + y = paddle.static.data( + name=f"y_{self.dtype}", shape=[13, 17], dtype=self.dtype) + out = paddle.heaviside(x, y) + + exe = paddle.static.Executor(place=place) + res = exe.run(prog, + feed={ + f"x_{self.dtype}": self.x_np, + f"y_{self.dtype}": self.y_np + }, + fetch_list=out, + use_prune=True) + + self.assertTrue(np.allclose(res, self.out_np)) + + def test_dygraph(self): + for use_cuda in ([False, True] + if paddle.device.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + result = paddle.heaviside( + paddle.to_tensor(self.x_np), paddle.to_tensor(self.y_np)) + + self.assertTrue(np.allclose(result.numpy(), self.out_np)) + + +class 
TestHeavisideAPI_float32(TestHeavisideAPI_float64): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("float32") + self.y_np = np.random.random((13, 17)).astype("float32") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "float32" + + +class TestHeavisideAPI_int64(TestHeavisideAPI_float64): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("int64") + self.y_np = np.random.random((13, 17)).astype("int64") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "int64" + + +class TestHeavisideAPI_int32(TestHeavisideAPI_float64): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("int32") + self.y_np = np.random.random((13, 17)).astype("int32") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "int32" + + +class TestHeavisideError(unittest.TestCase): + def test_input(self): + paddle.disable_static() + + def test_input_x(): + paddle.heaviside(1, paddle.randn([100])) + + self.assertRaises(ValueError, test_input_x) + + def test_input_y(): + paddle.heaviside(paddle.randn([100]), 1) + + self.assertRaises(ValueError, test_input_y) + + def test_input_xy(): + paddle.heaviside( + paddle.randn([100], 'float32'), paddle.randn([100], 'float64')) + + self.assertRaises(ValueError, test_input_xy) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_empty_op.py b/python/paddle/fluid/tests/unittests/test_empty_op.py index b8ff66a910ece..371c59a1b8cce 100644 --- a/python/paddle/fluid/tests/unittests/test_empty_op.py +++ b/python/paddle/fluid/tests/unittests/test_empty_op.py @@ -232,28 +232,33 @@ def test_static_graph(self): name="shape_tensor_int32", shape=[2], dtype="int32") shape_tensor_int64 = fluid.data( name="shape_tensor_int64", shape=[2], dtype="int64") + shape_tensor_unknown = fluid.data( + name="shape_tensor_unknown", shape=[-1], dtype="int64") out_1 = paddle.empty(shape=[200, 3], dtype=dtype) out_2 = paddle.empty(shape=shape_tensor_int32, dtype=dtype) out_3 = paddle.empty(shape=shape_tensor_int64, dtype=dtype) out_4 = paddle.empty(shape=[200, positive_2_int32], dtype=dtype) out_5 = paddle.empty(shape=[200, positive_2_int64], dtype=dtype) + out_6 = paddle.empty(shape=shape_tensor_unknown, dtype=dtype) place = paddle.CPUPlace() exe = paddle.static.Executor(place) - res_1, res_2, res_3, res_4, res_5 = exe.run( + res_1, res_2, res_3, res_4, res_5, res_6 = exe.run( fluid.default_main_program(), feed={ "shape_tensor_int32": np.array([200, 3]).astype("int32"), "shape_tensor_int64": np.array([200, 3]).astype("int64"), + "shape_tensor_unknown": np.array([200, 3]).astype("int64"), }, - fetch_list=[out_1, out_2, out_3, out_4, out_5]) + fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6]) self.__check_out__(res_1, dtype) self.__check_out__(res_2, dtype) self.__check_out__(res_3, dtype) self.__check_out__(res_4, dtype) self.__check_out__(res_5, dtype) + self.__check_out__(res_6, dtype) class TestEmptyError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py index 87c4656cfa809..a460c5f252777 100755 --- a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py +++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py @@ -22,8 +22,7 @@ import paddle import paddle.nn as nn from paddle.dataset.common import DATA_HOME -from paddle.fluid.framework import core, _non_static_mode, _enable_legacy_dygraph -_enable_legacy_dygraph() +from 
paddle.fluid.framework import core, _non_static_mode, _test_eager_guard from paddle.fluid.layer_helper import LayerHelper from paddle import _C_ops @@ -151,13 +150,12 @@ def predict(self, data): class TestBertTokenizerOp(unittest.TestCase): def setUp(self): self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") - self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab) - self.init_data() self.save_path = os.path.join(DATA_HOME, "fast_tokenizer") self.param_path = os.path.join(self.save_path, "model.pdparams") self.inference_path = os.path.join(self.save_path, "inference") def init_data(self): + self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab) self.text = [ '选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。' '酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,' @@ -179,8 +177,8 @@ def init_data(self): self.texts_tensor = to_string_tensor(self.texts, "texts") self.text_pairs_tensor = to_string_tensor(self.text_pairs, "text_pairs") - def test_padding(self): - + def run_padding(self): + self.init_data() self.max_seq_len = 128 self.pad_to_max_seq_len = True self.is_split_into_words = False @@ -283,7 +281,13 @@ def test_padding(self): np.allclose( token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) - def test_no_padding(self): + def test_padding(self): + with _test_eager_guard(): + self.run_padding() + self.run_padding() + + def run_no_padding(self): + self.init_data() self.max_seq_len = 128 self.pad_to_max_seq_len = False self.is_split_into_words = False @@ -336,7 +340,13 @@ def test_no_padding(self): np.allclose( token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) - def test_is_split_into_words(self): + def test_no_padding(self): + with _test_eager_guard(): + self.run_no_padding() + self.run_no_padding() + + def run_is_split_into_words(self): + self.init_data() self.is_split_into_words = True input_ids, token_type_ids = self.faster_tokenizer( @@ -355,7 +365,13 @@ def test_is_split_into_words(self): np.allclose( token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + def test_is_split_into_words(self): + with _test_eager_guard(): + self.run_is_split_into_words() + self.run_is_split_into_words() + def test_inference(self): + self.init_data() if not os.path.exists(self.save_path): os.makedirs(self.save_path, exist_ok=True) paddle.save(self.faster_tokenizer.state_dict(), self.param_path) @@ -383,6 +399,7 @@ def test_inference(self): token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) def test_feed_string_var(self): + self.init_data() paddle.enable_static() x = paddle.static.data( name="x", shape=[-1], dtype=core.VarDesc.VarType.STRINGS) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py index 4655b628dab4d..f382d61c63743 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py @@ -52,7 +52,7 @@ def test_ps_rolemaker(self): self.assertTrue(ro.is_server()) self.assertEqual(ro.worker_num(), 2) - def test_traing_role(self): + def test_training_role(self): """Test training role.""" os.environ["TRAINING_ROLE"] = "TEST" ro = role_maker.PaddleCloudRoleMaker(is_collective=False) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py index 5e8be9a852273..86ee0db30ef8c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py +++ 
b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py @@ -116,7 +116,7 @@ def test_ps_rolemaker(self): self.assertEqual(ro._all_gather(1, "worker"), 1) self.assertEqual(ro._all_reduce(1, "sum", "worker"), 1) - def test_traing_role(self): + def test_training_role(self): """Test training role.""" os.environ["TRAINING_ROLE"] = "TEST" diff --git a/python/paddle/fluid/tests/unittests/test_full_op.py b/python/paddle/fluid/tests/unittests/test_full_op.py index 108469cf8a732..723c4609bc96b 100644 --- a/python/paddle/fluid/tests/unittests/test_full_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_op.py @@ -80,8 +80,10 @@ def test_api_eager(self): with fluid.dygraph.base.guard(): with _test_eager_guard(): positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2) - positive_2_int64 = fluid.layers.fill_constant([1], "int64", 2) + positive_4_int64 = fluid.layers.fill_constant([1], "int64", 4, + True) + out_1 = paddle.full( shape=[1, 2], dtype="float32", fill_value=1.1) @@ -108,8 +110,19 @@ def test_api_eager(self): shape=[1], dtype=np.float32, value=1.1) out_7 = paddle.full( shape=[1, 2], dtype=np.float32, fill_value=val) + + out_8 = paddle.full( + shape=positive_2_int32, dtype="float32", fill_value=1.1) + + out_9 = paddle.full( + shape=[ + positive_2_int32, positive_2_int64, positive_4_int64 + ], + dtype="float32", + fill_value=1.1) + # test for numpy.float64 as fill_value - out_8 = paddle.full_like( + out_10 = paddle.full_like( out_7, dtype=np.float32, fill_value=np.abs(1.1)) assert np.array_equal( @@ -133,8 +146,12 @@ def test_api_eager(self): assert np.array_equal( out_7, np.full( [1, 2], 1.1, dtype="float32")) + assert np.array_equal(out_8, np.full([2], 1.1, dtype="float32")) + assert np.array_equal( + out_9, np.full( + [2, 2, 4], 1.1, dtype="float32")) assert np.array_equal( - out_8, np.full( + out_10, np.full( [1, 2], 1.1, dtype="float32")) diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index a3ae2a20dba23..67160f59952ef 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -26,7 +26,8 @@ from paddle.fluid import layers import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph +_enable_legacy_dygraph() default_main_program().random_seed = 42 diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py index a533b5d87a5a9..8c68eb243aea8 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -23,7 +23,8 @@ from paddle.nn.layer.common import Linear, Dropout import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph +_enable_legacy_dygraph() class TestFusedFFNOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py index 8f77972de8656..67f382a439d8c 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -109,6 +109,7 @@ def config(self): self.x_type = 
np.float32 self.attn_mask_type = np.float64 + #self.attn_mask_type = np.bool self.pre_layer_norm = True self.has_attn_mask = True @@ -168,6 +169,11 @@ def generate_input_data(self): self.attn_mask = (self.attn_mask - 1.0) * 1e4 else: self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e4 + elif self.attn_mask_type == np.bool: + if self.has_cache_kv and not self.gen_cache_kv: + self.attn_mask[:, :, :, -2] = 0 + else: + self.attn_mask = np.tril(self.attn_mask) else: raise ValueError( "'attn_mask_type' should be 'int64' or 'float64'.") @@ -394,7 +400,7 @@ def GetFusedMultiTransformerOut(self): epsilon = 1e-05 ln2_epsilon = 1e-05 - if attn_mask is not None: + if attn_mask is not None and self.attn_mask_type != np.bool: attn_mask = _convert_attention_mask(attn_mask, x.dtype) qkv_weights, qkv_biases = [], [] diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 7984ca5571658..20a55af15c441 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -162,7 +162,7 @@ def check_clip_result(self, out, out_clip): "gradient clip by global norm has wrong results!, \nu={}\nv={}\ndiff={}". format(u, v, u - v)) - # test whether the ouput is right when use 'set_gradient_clip' + # test whether the output is right when use 'set_gradient_clip' def test_old_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) @@ -172,7 +172,7 @@ def func(params_grads): self.clip_gradient = func self.check_gradient_clip(fluid.CPUPlace()) - # test whether the ouput is right when use grad_clip + # test whether the output is right when use grad_clip def test_new_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) @@ -181,7 +181,7 @@ def func(params_grads): self.clip_gradient = func self.check_gradient_clip(fluid.CPUPlace()) - # test whether the ouput is right when use grad_clip under float64 + # test whether the output is right when use grad_clip under float64 def test_new_gradient_clip_fp64(self): def func(params_grads): clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) @@ -267,7 +267,7 @@ def check_clip_result(self, out, out_clip): a=u, b=v, rtol=1e-5, atol=1e-8), "gradient clip by norm has wrong results!") - # test whether the ouput is right when use grad_clip + # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) @@ -311,7 +311,7 @@ def check_clip_result(self, out, out_clip): a=u, b=v, rtol=1e-6, atol=1e-8), "gradient clip by value has wrong results!") - # test whether the ouput is right when use grad_clip + # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) @@ -397,7 +397,7 @@ def check_clip_result(self, loss, optimizer): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-6, atol=1e-8), - "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by global norm has wrong results, expetcd:%f, but received:%f" % (a, b)) @@ -426,7 +426,7 @@ def check_clip_result(self, loss, optimizer): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-6, atol=1e-8), - "gradient clip by norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by norm has wrong results, 
expetcd:%f, but received:%f" % (a, b)) @@ -517,7 +517,7 @@ def test_gradient_clip(self): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-3, atol=1e-8), - "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by global norm has wrong results, expetcd:%f, but received:%f" % (a, b)) @@ -563,7 +563,7 @@ def test_gradient_clip(self): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-6, atol=1e-8), - "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by global norm has wrong results, expetcd:%f, but received:%f" % (a, b)) diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 965ae65614a40..51ff8ec943d01 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -198,7 +198,7 @@ def test_check_grad(self): @skip_check_grad_ci( - reason="For 'TestHSigmoidOpSparse', check_grad is is separately calculated by 'TestHSigmoidOpWithSparseGrad'." + reason="For 'TestHSigmoidOpSparse', check_grad is separately calculated by 'TestHSigmoidOpWithSparseGrad'." ) class TestHSigmoidOpSparse(OpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index 60dd4948f996e..96a818549e700 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -321,6 +321,70 @@ def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False): return learning_rate * math.pow(gamma, epoch_num // step_size) +def one_cycle_lr(epoch_num, + max_learning_rate, + total_steps, + divide_factor=25, + end_learning_rate=0.0001, + phase_pct=0.3, + anneal_strategy='cos', + three_phase=False, + verbose=False): + initial_lr = max_learning_rate / divide_factor + if three_phase: + _end_steps = [ + float(phase_pct * total_steps) - 1, + float(2 * phase_pct * total_steps) - 2, total_steps - 1 + ] + _schedule_phases = [ + { + 'start_lr': initial_lr, + 'end_lr': max_learning_rate, + }, + { + 'start_lr': max_learning_rate, + 'end_lr': initial_lr, + }, + { + 'start_lr': initial_lr, + 'end_lr': end_learning_rate, + }, + ] + else: + _end_steps = [float(phase_pct * total_steps) - 1, total_steps - 1] + _schedule_phases = [ + { + 'start_lr': initial_lr, + 'end_lr': max_learning_rate, + }, + { + 'start_lr': max_learning_rate, + 'end_lr': end_learning_rate, + }, + ] + + if anneal_strategy == 'cos': + + def anneal_func(start, end, pct): + cos_out = math.cos(math.pi * pct) + 1 + return end + (start - end) / 2.0 * cos_out + else: + + def anneal_func(start, end, pct): + return (end - start) * pct + start + + start_step = 0 + for i, phase in enumerate(_schedule_phases): + end_step = _end_steps[i] + if epoch_num <= end_step or i == len(_schedule_phases) - 1: + pct = (epoch_num - start_step) / (end_step - start_step) + computed_lr = anneal_func(phase['start_lr'], phase['end_lr'], pct) + break + start_step = end_step + + return computed_lr + + class TestLRScheduler(unittest.TestCase): def _test_static(self, python_func, paddle_api, kwarg, place): scheduler = paddle_api(**kwarg) @@ -467,6 +531,33 @@ def test_scheduler(self): with self.assertRaises(ValueError): paddle.optimizer.lr.MultiStepDecay( learning_rate=0.5, milestones=[1, 2, 3], gamma=2) + with self.assertRaises(TypeError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate='test', total_steps=20) + with 
self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=-1.5, total_steps=20) + with self.assertRaises(TypeError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps=20, end_learning_rate='test') + with self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps=20, end_learning_rate=-1) + with self.assertRaises(TypeError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps='test') + with self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps=-10) + with self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps=20, anneal_strategy='test') + with self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, + total_steps=20, + phase_pct=0.6, + three_phase=True) func_api_kwargs = [(noam_lr, paddle.optimizer.lr.NoamDecay, { "d_model": 0.01, @@ -527,6 +618,38 @@ def test_scheduler(self): "learning_rate": 0.5, "T_max": 10, "verbose": False + }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, { + "max_learning_rate": 0.1, + "total_steps": 20, + "divide_factor": 5, + "end_learning_rate": 0.0001, + "anneal_strategy": 'cos', + "phase_pct": 0.3, + "three_phase": False, + }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, { + "max_learning_rate": 0.5, + "total_steps": 20, + "divide_factor": 10, + "end_learning_rate": 0.001, + "anneal_strategy": 'linear', + "phase_pct": 0.4, + "three_phase": False, + }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, { + "max_learning_rate": 1.0, + "total_steps": 20, + "divide_factor": 9, + "end_learning_rate": 0.0001, + "anneal_strategy": 'cos', + "phase_pct": 0.3, + "three_phase": True, + }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, { + "max_learning_rate": 0.3, + "total_steps": 20, + "divide_factor": 25, + "end_learning_rate": 0.0005, + "anneal_strategy": 'linear', + "phase_pct": 0.2, + "three_phase": True, })] for python_func, paddle_api, kwarg in func_api_kwargs: diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index d89465c5aecab..3a100cd321e03 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -157,6 +157,9 @@ def test_grad(self): class TestTileDoubleGradCheck(unittest.TestCase): + def tile_wrapper(self, x): + return paddle.tile(x[0], [4, 9]) + @prog_scope() def func(self, place): x_shape = [3, 12] @@ -171,6 +174,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.tile_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -181,6 +186,9 @@ def test_grad(self): class TestExpandV2DoubleGradCheck(unittest.TestCase): + def expand_wrapper(self, x): + return paddle.expand(x[0], [4, 12]) + @prog_scope() def func(self, place): x_shape = [1, 12] @@ -195,6 +203,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.expand_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -253,6 +263,9 @@ def test_grad(self): class TestClipDoubleGradCheck(unittest.TestCase): + def clip_wrapper(self, x): + return paddle.clip(x[0], min=-1., max=1.) 
+ @prog_scope() def func(self, place): x_shape = [2, 4, 10] @@ -264,6 +277,8 @@ def func(self, place): x_arr = np.random.uniform(-5., 5., x_shape).astype(dtype) gradient_checker.double_grad_check([x], out, x_init=x_arr, place=place) + gradient_checker.double_grad_check_for_dygraph( + self.clip_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -357,6 +372,9 @@ def func(self, place): class TestConcatDoubleGradCheck(unittest.TestCase): + def concat_wrapper(self, x): + return paddle.concat(x, axis=0) + @prog_scope() def func(self, place): x_shape = [2, 3, 4, 5] @@ -373,6 +391,11 @@ def func(self, place): gradient_checker.double_grad_check( [x1, x2], out, x_init=[x1_arr, x2_arr], place=place) + gradient_checker.double_grad_check_for_dygraph( + self.concat_wrapper, [x1, x2], + out, + x_init=[x1_arr, x2_arr], + place=place) def test_grad(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index ba1e9be815de6..a0c5ce77f1d25 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -1123,7 +1123,7 @@ def test_api_eager_dygraph(self): class TestMasterWeightSaveForFP16(unittest.TestCase): ''' - For Amp-O2, some optimizer(Momentum, Adam ...) will create master weights for parameters to to improve the accuracy. + For Amp-O2, some optimizer(Momentum, Adam ...) will create master weights for parameters to improve the accuracy. Master weights will be saved by optimizer::state_dict. ''' diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py index 20a5fcb7af3b1..9b48a87bff7b9 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py @@ -32,7 +32,7 @@ def test_seresnext_with_learning_rate_decay(self): self._compare_result_with_origin_model( check_func, use_device=DeviceType.CPU, - compare_seperately=False, + compare_separately=False, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py index 9d1364cc592fe..ff529ce94bd25 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py @@ -30,7 +30,7 @@ def test_seresnext_with_learning_rate_decay(self): optimizer=seresnext_net.optimizer, use_parallel_executor=False) self._compare_result_with_origin_model( - check_func, use_device=DeviceType.CUDA, compare_seperately=False) + check_func, use_device=DeviceType.CUDA, compare_separately=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py index 8f46119d551c6..04772a2da2871 100644 --- a/python/paddle/fluid/tests/unittests/test_pipeline.py +++ b/python/paddle/fluid/tests/unittests/test_pipeline.py @@ -63,7 +63,7 @@ def test_dist_train_one_device(self): "pipeline_mnist_one_device.py", check_error_log=True, log_name=flag_name, - need_envs=self.need_envs()) + need_envs={"PADDLE_MANUAL_PIPELINE_STAGE": "0"}) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_run.py 
b/python/paddle/fluid/tests/unittests/test_run.py index 365d3f931c27c..28bcc379fb9a0 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -51,7 +51,9 @@ def write_file(name, ct): def get_files(pth, prefix): return [ - f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix) + f for f in listdir(pth) + if isfile(join(pth, f)) and f.startswith(prefix) and f != + f"{prefix}.gpu.log" ] diff --git a/python/paddle/fluid/tests/unittests/test_sparse_activation_op.py b/python/paddle/fluid/tests/unittests/test_sparse_activation_op.py deleted file mode 100644 index b4abbd56303ff..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_activation_op.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import unittest -import numpy as np -import paddle -from paddle.fluid.framework import _test_eager_guard - - -class TestSparseActivation(unittest.TestCase): - def test_sparse_relu(self): - with _test_eager_guard(): - x = [[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]] - - def dense_relu(x): - dense_x = paddle.to_tensor( - x, dtype='float32', stop_gradient=False) - dense_relu = paddle.nn.ReLU() - dense_out = dense_relu(dense_x) - dense_out.backward(dense_out) - return dense_out, dense_x.grad - - dense_x = paddle.to_tensor(x, dtype='float32', stop_gradient=False) - sparse_dim = 2 - sparse_x = dense_x.to_sparse_coo(sparse_dim) - sparse_relu = paddle.sparse.ReLU() - sparse_out = sparse_relu(sparse_x) - sparse_out.backward(sparse_out) - - dense_out, dense_x_grad = dense_relu(x) - assert np.array_equal(dense_out.numpy(), - sparse_out.to_dense().numpy()) - assert np.array_equal(dense_x_grad.numpy(), - sparse_x.grad.to_dense().numpy()) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py new file mode 100644 index 0000000000000..573cc5ba8cf5d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +from typing import Union, Callable +import numpy as np +import paddle +from paddle.fluid.framework import _test_eager_guard +from paddle import _C_ops + + +class TestSparseUnary(unittest.TestCase): + def assert_raises_on_dense_tensor(self, sparse_func): + with _test_eager_guard(): + dense_x = paddle.ones((2, 3)) + with self.assertRaises(ValueError): + sparse_func(dense_x) + + def compare_with_dense( + self, + x, + to_sparse: Callable[[paddle.Tensor], paddle.Tensor], + dense_func: Callable[[paddle.Tensor], paddle.Tensor], + sparse_func: Callable[[paddle.Tensor], paddle.Tensor], + test_gradient: bool, ): + def tensor_allclose(dense_tensor: paddle.Tensor, + sparse_tensor: paddle.Tensor): + dense_numpy = dense_tensor.numpy() + mask = ~np.isnan(dense_numpy) + return np.allclose(dense_numpy[mask], + sparse_tensor.to_dense().numpy()[mask]) + + with _test_eager_guard(): + dense_x = paddle.to_tensor( + x, dtype="float32", stop_gradient=not test_gradient) + + sparse_x = to_sparse(dense_x) + sparse_out = sparse_func(sparse_x) + + dense_x = paddle.to_tensor( + x, dtype="float32", stop_gradient=not test_gradient) + dense_out = dense_func(dense_x) + + assert tensor_allclose(dense_out, sparse_out) + + if test_gradient: + dense_out.backward(dense_out) + sparse_out.backward(sparse_out) + assert tensor_allclose(dense_x.grad, sparse_x.grad) + + def test_sparse_relu(self): + x = [[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.nn.ReLU(), + paddle.sparse.ReLU(), + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.nn.ReLU(), + paddle.sparse.ReLU(), + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.ReLU()) + + def test_sparse_sqrt(self): + x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.sqrt, + paddle.sparse.sqrt, + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.sqrt, + paddle.sparse.sqrt, + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.sqrt) + + def test_sparse_sin(self): + x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.sin, + paddle.sparse.sin, + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.sin, + paddle.sparse.sin, + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.sin) + + def test_sparse_tanh(self): + x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, -4, 2, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.tanh, + paddle.sparse.tanh, + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.tanh, + paddle.sparse.tanh, + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.tanh) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py index 29f3308988f6d..9ac016511c20d 100644 --- a/python/paddle/fluid/tests/unittests/test_tensordot.py +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle -import unittest -import paddle.fluid.core as core -import numpy as np import itertools as it +import numpy as np +import unittest -np.set_printoptions(threshold=np.inf) +import paddle +import paddle.fluid.core as core def tensordot_np(x, y, axes): @@ -68,9 +67,16 @@ def tensordot_np(x, y, axes): class TestTensordotAPI(unittest.TestCase): def setUp(self): + self.set_place() self.set_dtype() self.set_input_shape() self.set_input_data() + self.set_test_axes() + + def set_place(self): + self.places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(core.CUDAPlace(0)) def set_dtype(self): self.dtype = np.float32 @@ -82,33 +88,8 @@ def set_input_shape(self): def set_input_data(self): self.x = np.random.random(self.x_shape).astype(self.dtype) self.y = np.random.random(self.y_shape).astype(self.dtype) - self.all_axes = [2] - def run_dygraph(self, place): - paddle.disable_static() - x = paddle.to_tensor(self.x, place=place) - y = paddle.to_tensor(self.y, place=place) - paddle_res = paddle.tensordot(x, y, self.axes) - np_res = tensordot_np(self.x, self.y, self.axes) - np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) - - def run_static(self, place): - paddle.enable_static() - with paddle.static.program_guard(paddle.static.Program(), - paddle.static.Program()): - x = paddle.static.data( - name='x', shape=self.x_shape, dtype=self.dtype) - y = paddle.static.data( - name='y', shape=self.y_shape, dtype=self.dtype) - z = paddle.tensordot(x, y, self.axes) - exe = paddle.static.Executor(place) - paddle_res = exe.run(feed={'x': self.x, - 'y': self.y}, - fetch_list=[z]) - np_res = tensordot_np(self.x, self.y, self.axes) - np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) - - def test_cases(self): + def set_test_axes(self): self.all_axes = [] axial_index = range(4) all_permutations = list(it.permutations(axial_index, 0)) + list( @@ -136,57 +117,146 @@ def test_cases(self): self.all_axes.extend(range(5)) - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - + def test_dygraph(self): + paddle.disable_static() + for axes in self.all_axes: + for place in self.places: + x = paddle.to_tensor(self.x, place=place) + y = paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, axes) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) + + def test_static(self): + paddle.enable_static() for axes in self.all_axes: - self.axes = axes - for place in places: - self.run_dygraph(place) - self.run_static(place) + for place in self.places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.static.data( + name='x', shape=self.x_shape, dtype=self.dtype) + y = paddle.static.data( + name='y', shape=self.y_shape, dtype=self.dtype) + z = paddle.tensordot(x, y, axes) + exe = paddle.static.Executor(place) + paddle_res = exe.run(feed={'x': self.x, + 'y': self.y}, + fetch_list=[z]) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) class TestTensordotAPIFloat64(TestTensordotAPI): + # Only test a small part of axes case for Float64 type + def set_test_axes(self): + self.all_axes = [ + [[3, 2], [3]], [[2, 1, 0], [2, 1]], [[1, 2, 0], [1, 3, 2]], [3, 0], + [[], [0, 3, 1]], [[2, 1, 0, 3], [2, 0, 1, 3]], + [[3, 1, 2], [1, 3, 2, 0]], [[2, 1], [0, 2]], [[2, 0, 1, 3], [2]], + [[1, 2, 0, 3], [0, 2, 1]], [[2, 1, 3, 0], [1, 2, 3]], + [[2, 0, 1, 3], [3, 1, 0, 2]], [[0, 
3], [0, 3, 2, 1]], + [[1, 3, 2, 0], [2, 1, 0, 3]], [[1, 3, 2, 0], [1, 3, 2, 0]], + [[1, 0, 2], [0, 1]], [[2, 3, 0], [3, 1]], + [[1, 3, 2, 0], [3, 0, 1, 2]], [[3, 2, 1], [2, 0, 1]], [[0], []], + [[2, 3, 0], [1, 2, 0]], [[3, 0, 2, 1], [2, 1, 0, 3]], + [[3, 1, 2], [2, 3, 1]], [[1, 0, 2, 3], []], [[1, 2], [1, 2, 3]], + [[2, 0, 1, 3], [2, 0, 1]], [[3, 1, 2], [1, 3, 2]], + [[3, 1, 2, 0], [1, 2, 3, 0]], [[0, 2, 3], [0, 1, 2]], + [[3, 2, 0], [2, 0, 3, 1]], [[2, 1, 0, 3], [3, 1, 2, 0]], + [[1, 2, 3, 0], [1, 3, 0, 2]], [[3, 0], [2, 1]], + [[0, 1, 3, 2], [0, 2, 1, 3]], [[1, 0], [2, 1, 3]], + [[1, 0, 3, 2], [2, 3, 0, 1]], [[1, 2], [3]], + [[1, 2, 3, 0], [3, 2, 1, 0]], [[0, 3, 2, 1], [2, 1, 3, 0]], [0], + [[0, 2, 3], [3, 2, 0, 1]], [[1, 2, 3, 0], [3, 2, 1, 0]], + [[3, 1], [3]], [[3, 2, 0, 1], [3, 2, 0]], [[2, 3, 0, 1], [0, 3, 2]], + [[1], [1, 3]], [[1, 2], [2, 1, 0]], [[3, 1, 2], [3, 1, 0]], + [[1, 3], [3, 1, 2]], [[2, 0, 1, 3], [3, 1, 0, 2]], + [[1, 3, 0], [1, 3]], [[2, 3, 1], [1, 0, 2]], + [[1, 2, 0, 3], [0, 2, 1, 3]], [[2], [0, 1, 3]], [[1], [1, 2]], + [[1, 0, 2, 3], [3, 0, 1, 2]], [[0, 1, 3, 2], [1, 3, 0, 2]], + [[3, 0, 2, 1], [0, 2, 3]], [[1, 2, 0], [1, 2, 3]], + [[1, 0, 3], [2, 3, 0]], [[2, 3, 0], [3, 1, 0]], [[1, 3], [1, 0]], + [[2, 1, 0, 3], [2, 0, 3, 1]], [[3, 2, 0], [2, 1, 0]], + [[0, 1, 3], [0, 3, 1]], [[3, 1, 0], [3, 2, 1]], [[3, 2], [3, 1]], + [[3], [2, 1, 0]], [[1, 2, 3, 0], []], [[1, 3, 2, 0], [3, 1, 2]], + [[1], [0, 2]], [[3, 2, 0], [3, 2, 0]], [[3], []], + [[1, 0, 3], [2, 1]], [[3, 1, 0, 2], [2, 3, 1, 0]], + [[0, 1], [0, 3, 2]], [[0, 2, 3], [0, 2, 1]], [[1, 3, 0], [3, 0, 2]], + [[3, 1, 2], [1, 2, 3]], [[3, 1, 2], [3, 1, 0]], + [[0, 3, 1, 2], [3, 2, 1, 0]], [[0, 3], [3, 2, 1]], + [[2, 3], [1, 3, 0]], [[0, 3, 2], [2, 0, 3, 1]], [[2, 3], [1, 3]], + [[3, 1, 2, 0], [2, 3, 1, 0]], [[1, 0, 3, 2], [3, 0, 1, 2]], + [[3, 2, 1, 0], [0, 1, 3, 2]], [[3, 1, 2], [3]], + [[0, 1, 3, 2], [2, 3, 0, 1]], [[1, 2, 3, 0], [1, 3, 0, 2]], + [3, 1, 2], [[3, 1, 2], [0, 3, 2]], [[2, 3, 0], [1, 2, 0]], + [[2, 0, 3], [2, 0]], [[3, 1, 0, 2], [3, 1, 0, 2]], + [[0, 1, 2], [2, 0, 1]], [[1, 0, 3], [2, 3, 0]], + [[2, 0, 1], [0, 1, 3]], [[2, 1], [0, 1, 3]] + ] + def set_dtype(self): self.dtype = np.float64 +class TestTensordotAPIBroadcastCase1(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [1, 1, 1, 5] + self.y_shape = [1, 5, 1, 1] + + +class TestTensordotAPIBroadcastCase2(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [1, 5, 5, 5] + self.y_shape = [1, 1, 1, 5] + + +class TestTensordotAPIBroadcastCase3(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [5, 5, 1, 5] + + +class TestTensordotAPIBroadcastCase4(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [1, 1, 1, 1] + + +class TestTensordotAPIBroadcastCase5(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [1, 1, 5, 5] + self.y_shape = [5, 5, 1, 5] + + class TestTensordotAPIAxesType(TestTensordotAPI): def set_input_shape(self): self.x_shape = [3, 4, 4] self.y_shape = [4, 4, 5] - def test_cases(self): + def set_test_axes(self): self.all_axes = [ 0, 1, 2, (1, ), [1], ((1, ), ), ([1], ), ((2, 1), (0, )), ( (1, 2), (0, 1)), ([1, 2], [0, 1]), ([1, 2], [0, 1]), [[1, 2], [0, 1]] ] - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - - for axes in self.all_axes: - self.axes = axes - for place in places: - self.run_dygraph(place) - self.run_static(place) - + 
def test_tensor_axes(self): # The 'axes' with type 'Tensor' in tensordot is not available in static mode paddle.disable_static() - for place in places: - self.all_axes = [ - paddle.to_tensor([1]), (paddle.to_tensor([1])), - (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])), - [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])], - paddle.to_tensor([[1, 2], [0, 1]]) - ] - for axes in self.all_axes: - self.axes = axes - for place in places: - self.run_dygraph(place) + tensor_axes = [ + paddle.to_tensor([1]), (paddle.to_tensor([1])), + (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])), + [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])], + paddle.to_tensor([[1, 2], [0, 1]]) + ] + + for place in self.places: + for axes in tensor_axes: + x = paddle.to_tensor(self.x, place=place) + y = paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, axes) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) def test_error(self): self.all_axes = [[[[0], [1]]], 0.1, -1, 100, [[1, 2], [0, 0]], @@ -204,35 +274,5 @@ def set_dtype(self): self.dtype = np.float64 -class TestTensordotAPIBroadcastCase1(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [1, 1, 1, 5] - self.y_shape = [1, 5, 1, 1] - - -class TestTensordotAPIBroadcastCase2(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [1, 5, 5, 5] - self.y_shape = [1, 1, 1, 5] - - -class TestTensordotAPIBroadcastCase3(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [5, 5, 5, 1] - self.y_shape = [5, 5, 1, 5] - - -class TestTensordotAPIBroadcastCase4(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [5, 5, 5, 1] - self.y_shape = [1, 1, 1, 1] - - -class TestTensordotAPIBroadcastCase5(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [1, 1, 5, 5] - self.y_shape = [5, 5, 1, 5] - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py index f6dc3fba6a214..95ad254a6dfb0 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py @@ -116,7 +116,7 @@ def init_test_case(self): self.output_size = None -class TestUnpoolOpOuputsize(TestUnpoolOp): +class TestUnpoolOpOutputsize(TestUnpoolOp): def init_test_case(self): self.unpool2d_forward_naive = unpool2dmax_forward_naive self.unpooling_type = "max" @@ -127,7 +127,7 @@ def init_test_case(self): self.output_size = [9, 9] -class TestUnpoolOpOuput(TestUnpoolOp): +class TestUnpoolOpOutput(TestUnpoolOp): def init_test_case(self): self.unpool2d_forward_naive = unpool2dmax_forward_naive self.unpooling_type = "max" diff --git a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py index d5f4cef5b8759..fb1cd35c45380 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py @@ -37,6 +37,7 @@ 'dot', 'elementwise_add', 'elementwise_div', + 'elementwise_heaviside', 'elementwise_max', 'elementwise_min', 'elementwise_mul', diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py index 7aaa78856811f..b0bb9a37c16bd 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py +++ 
b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py @@ -23,41 +23,52 @@ from paddle.fluid import compiler, Program, program_guard import paddle +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUAccuracyOp(OpTest): - def setUp(self): - self.op_type = "accuracy" - self.init_dtype() - n = 8192 - infer = np.random.random((n, 1)).astype(self.dtype) - indices = np.random.randint(0, 2, (n, 1)).astype('int64') - label = np.random.randint(0, 2, (n, 1)).astype('int64') - self.inputs = {'Out': infer, 'Indices': indices, "Label": label} - num_correct = 0 - for rowid in range(n): - for ele in indices[rowid]: - if ele == label[rowid]: - num_correct += 1 - break - self.outputs = { - 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), - 'Correct': np.array([num_correct]).astype("int32"), - 'Total': np.array([n]).astype("int32") - } - self.attrs = {'use_xpu': True} - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) +class XPUTestAccuracyOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'accuracy' + self.use_dynamic_create_class = False + + class TestXPUAccuracyOp(XPUOpTest): + def setUp(self): + self.op_type = "accuracy" + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype('int64') + label = np.random.randint(0, 2, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': + np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") + } + self.attrs = {'use_xpu': True} + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + +support_types = get_xpu_op_support_types('accuracy') +for stype in support_types: + create_test_class(globals(), XPUTestAccuracyOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py index c29150ef921c2..67fd9f871207b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py @@ -25,30 +25,43 @@ from paddle.fluid import core from paddle.fluid.op import Operator +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper -class TestSGDOp(OpTest): - def setUp(self): - self.op_type = "sgd" - self.conf() - w = np.random.random((self.h, self.w)).astype("float32") - g = np.random.random((self.h, self.w)).astype("float32") - lr = np.array([0.1]).astype("float32") - self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} - self.outputs = {'ParamOut': w - lr * g} +class XPUTestSgdOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sgd' + self.use_dynamic_create_class = False - def conf(self): - self.h = 102 - self.w = 105 + class 
TestSGDOp(XPUOpTest): + def setUp(self): + self.op_type = "sgd" + self.dtype = self.in_type + self.conf() + w = np.random.random((self.h, self.w)).astype(self.dtype) + g = np.random.random((self.h, self.w)).astype(self.dtype) + lr = np.array([0.1]).astype(self.dtype) - def test_check_output_with_place(self): - self.check_output_with_place(paddle.XPUPlace(0)) + self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} + self.outputs = {'ParamOut': w - lr * g} + def conf(self): + self.h = 102 + self.w = 105 -class TestSGDOpCase8X(TestSGDOp): - def conf(self): - self.h = 10 - self.w = 64 + def test_check_output_with_place(self): + self.check_output_with_place(paddle.XPUPlace(0)) + + class TestSGDOpCase8X(TestSGDOp): + def conf(self): + self.h = 10 + self.w = 64 + + +support_types = get_xpu_op_support_types('sgd') +for stype in support_types: + create_test_class(globals(), XPUTestSgdOp, stype) class TestSGDOpWithLargeInput(unittest.TestCase): diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index c1891d24b88c9..8e8dd7855113b 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -1039,7 +1039,7 @@ def _legacy_load(path, **configs): config) else: # load state dict by `io.save_params/persistables` save format - # TODO(chenweihang): [ Now only supports loading parameters seperately ] + # TODO(chenweihang): [ Now only supports loading parameters separately ] # If users save all parameters as one file, the [ variable.name -> variable ] # mapping info will lost, so users need to give variable list, but users build # variable list in dygraph mode is difficult, we recommend users to use diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index c17a56fc28d88..a7a5e59f39409 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -29,7 +29,7 @@ import paddle from paddle import fluid from paddle.fluid import core -from paddle.fluid.framework import _non_static_mode +from paddle.fluid.framework import _non_static_mode, in_dygraph_mode from paddle.fluid.framework import Variable from paddle.fluid.framework import _get_paddle_place from paddle.fluid.framework import _current_expected_place as _get_device @@ -761,6 +761,15 @@ def eval_batch(self, inputs, labels=None): labels = [to_variable(l) for l in to_list(labels)] outputs = self.model.network.forward(*[to_variable(x) for x in inputs]) + + # Transfrom data to expected device + expected_device = paddle.device.get_device() + for o in to_list(outputs): + o._to(device=expected_device) + + for l in labels: + l._to(device=expected_device) + if self.model._loss: losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) @@ -915,7 +924,7 @@ class Model(object): When training on GPU, auto mixed precision (AMP O1) and pure float16 (AMP O2) training are both supported in static mode and dynamic mode. - In static graph mode, before traing with pure float16 (AMP O2), + In static graph mode, before training with pure float16 (AMP O2), `multi_precision` could be set to True when creating optimizer, which can avoid poor accuracy or slow convergence in a way, and inputs of dtype float should be cast to float16 by users. `paddle.static.amp.fp16_guard` API @@ -2075,7 +2084,7 @@ def _run_one_epoch( # [input1, input2, ..., label1, lable2, ...] # 3. custumed iterator yield concated inputs and labels: # [input1, input2, ..., label1, lable2, ...] - # 4. custumed iterator yield seperated inputs and labels: + # 4. 
custumed iterator yield separated inputs and labels: # ([input1, input2, ...], [label1, lable2, ...]) # To handle all of these, flatten (nested) list to list. data = flatten(data) @@ -2088,7 +2097,6 @@ def _run_one_epoch( callbacks.on_batch_begin(mode, step, logs) if mode != 'predict': - _inputs = [data[:len(self._inputs)], data[len(self._inputs):]] if mode == 'train': _inputs.append((step + 1) % self._accumulate == 0 or diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py index e98a23bc52d65..7ac555e2520ea 100644 --- a/python/paddle/incubate/autotune.py +++ b/python/paddle/incubate/autotune.py @@ -49,7 +49,7 @@ def set_config(config=None): dictionary, the key is the tuning type, and the value is a dictionary of the corresponding tuning parameters. If it is a string, the path of a json file will be specified and the tuning configuration will be set - by the the json file. Default: None, auto-tuning for kernel, layout and + by the json file. Default: None, auto-tuning for kernel, layout and dataloader will be enabled. Examples: diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index b620253b9f26f..cf56f74d1f12d 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -158,7 +158,7 @@ def _dygraph_clip(self, params_grads): normal_params_grads = [] moe_params_grads = [] - # seperate moe params from normal params + # separate moe params from normal params if self.moe_group is not None and self.moe_group.nranks > 1: for p, g in params_grads: if self.is_expert_param_func(p): diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index d76b990958c94..072c7d9fccade 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -101,9 +101,9 @@ def __init__(self, super(FusedMultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() @@ -278,10 +278,10 @@ def __init__(self, super(FusedFeedForward, self).__init__() assert d_model > 0, ( - "Expected d_model to be greater than 0, but recieved {}".format( + "Expected d_model to be greater than 0, but received {}".format( d_model)) assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but recieved {}". + "Expected dim_feedforward to be greater than 0, but received {}". 
format(dim_feedforward)) self._dtype = self._helper.get_default_dtype() @@ -434,12 +434,12 @@ def __init__(self, super(FusedTransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self.normalize_before = normalize_before @@ -808,11 +808,11 @@ def __init__(self, super(FusedMultiTransformer, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but recieved {}". + "Expected dim_feedforward to be greater than 0, but received {}". format(dim_feedforward)) self.normalize_before = normalize_before diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py index 23fd8dc0825f0..2065b3c1c94c0 100644 --- a/python/paddle/incubate/optimizer/functional/bfgs.py +++ b/python/paddle/incubate/optimizer/functional/bfgs.py @@ -49,16 +49,16 @@ def minimize_bfgs(objective_func, Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp140: Algorithm 6.1 (BFGS Method). Args: - objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. + objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar. + initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` . max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. - initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None. + initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. If not given, will use an identity matrix of order N, which is size of ``initial_position`` . Default value: None. line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the futrue. Default value: 'strong wolfe'. max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50. initial_step_length (float, optional): step length used in first iteration of line search. 
different initial_step_length may cause different optimal result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0. - dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'. + dtype ('float32' | 'float64', optional): data type used in the algorithm, the data type of the input parameter must be consistent with the dtype. Default value: 'float32'. name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None. Returns: diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py index f283381597733..e15ad56dc2d11 100644 --- a/python/paddle/incubate/optimizer/functional/lbfgs.py +++ b/python/paddle/incubate/optimizer/functional/lbfgs.py @@ -50,17 +50,17 @@ def minimize_lbfgs(objective_func, Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS). Args: - objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. + objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar. + initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` . history_size (Scalar): the number of stored vector pairs {si,yi}. Default value: 100. max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. - initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None. + initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. If not given, will use an identity matrix of order N, which is size of ``initial_position`` . Default value: None. line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the futrue. Default value: 'strong wolfe'. max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50. initial_step_length (float, optional): step length used in first iteration of line search. different initial_step_length may cause different optimal result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0. - dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'. + dtype ('float32' | 'float64', optional): data type used in the algorithm, the data type of the input parameter must be consistent with the dtype. Default value: 'float32'. name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None. 
Returns: diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index ca3ac1772829d..d08821e510c2b 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -392,20 +392,24 @@ def hsigmoid_loss(input, paddle.set_device('cpu') - input = paddle.uniform([2, 3]) - # [[-0.8018668 0.8736385 -0.9064771 ] # random - # [-0.10228515 -0.87188244 -0.8783718 ]] # random + input = paddle.uniform([4, 3]) + # [[0.45424712 -0.77296764 0.82943869] # random + # [0.85062802 0.63303483 0.35312140] # random + # [0.57170701 0.16627562 0.21588242] # random + # [0.27610803 -0.99303514 -0.17114788]] # random label = paddle.to_tensor([0, 1, 4, 5]) num_classes = 5 weight=paddle.uniform([num_classes-1, 3]) - # [[-0.24148715 0.8449961 -0.7399121 ] # random - # [-0.9800559 0.43509364 0.9091208 ] # random - # [ 0.60194826 0.10430074 -0.4521166 ] # random - # [-0.4469818 -0.01536179 -0.604454 ]] # random + # [[-0.64477652 0.24821866 -0.17456549] # random + # [-0.04635394 0.07473493 -0.25081766] # random + # [ 0.05986035 -0.12185556 0.45153677] # random + # [-0.66236806 0.91271877 -0.88088769]] # random out=F.hsigmoid_loss(input, label, num_classes, weight) - # [[3.0159328] - # [2.2407534]] + # [[1.96709502] + # [2.40019274] + # [2.11009121] + # [1.92374969]] """ if _non_static_mode(): @@ -542,7 +546,7 @@ def margin_ranking_loss(input, name=None): r""" - This op the calcluate the the margin rank loss between the input, other and label, use the math function as follows. + This op the calcluate the margin rank loss between the input, other and label, use the math function as follows. .. math:: margin\_rank\_loss = max(0, -label * (input - other) + margin) @@ -879,7 +883,7 @@ def kl_div(input, label, reduction='mean', name=None): While :attr:`reduction` is :attr:`none`, output loss is in the same shape as input, loss in each point is calculated - seperately and no reduction is applied. + separately and no reduction is applied. While :attr:`reduction` is :attr:`mean`, output loss is in shape of [1] and loss value is the mean value of all losses. @@ -2006,7 +2010,7 @@ def sigmoid_focal_loss(logit, Available dtype is float32, float64. normalizer (Tensor, optional): The number normalizes the focal loss. It has to be a 1-D Tensor whose shape is `[1, ]`. The data type is float32, float64. - For object detection task, it is the the number of positive samples. + For object detection task, it is the number of positive samples. If set to None, the focal loss will not be normalized. Default is None. alpha(int|float, optional): Hyper-parameter to balance the positive and negative example, it should be between 0 and 1. Default value is set to 0.25. 
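For reference, the margin ranking formula documented in the loss.py hunk above, max(0, -label * (input - other) + margin), can be sanity-checked with a small NumPy sketch. This is an illustrative reference only, not part of the patch; the helper name, the sample arrays, and the mirrored 'mean'/'sum'/'none' reduction modes are assumptions made for demonstration.

import numpy as np

def margin_rank_loss_ref(input, other, label, margin=0.0, reduction="mean"):
    # Reference for the documented formula: max(0, -label * (input - other) + margin).
    loss = np.maximum(0.0, -label * (input - other) + margin)
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss  # reduction == "none": keep the element-wise loss

# label is +1 when `input` should be ranked above `other`, -1 otherwise.
inp = np.array([1.0, 2.0, 3.0], dtype="float32")
oth = np.array([2.0, 1.0, 3.0], dtype="float32")
lab = np.array([1.0, -1.0, 1.0], dtype="float32")
print(margin_rank_loss_ref(inp, oth, lab, margin=0.1))  # mean of [1.1, 1.1, 0.1]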
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 3160f04e830d2..6a573005f4514 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -1160,22 +1160,21 @@ def max_pool3d(x, import paddle import paddle.nn.functional as F - import numpy as np # max pool3d - x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) - output = F.max_pool2d(x, + x = paddle.uniform([1, 3, 32, 32, 32]) + output = F.max_pool3d(x, kernel_size=2, stride=2, padding=0) - output.shape [1, 3, 16, 16, 16] + # output.shape [1, 3, 16, 16, 16] # for return_mask=True - x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + x = paddle.uniform([1, 3, 32, 32, 32]) output, max_indices = paddle.nn.functional.max_pool3d(x, kernel_size = 2, stride = 2, padding=0, return_mask=True) - # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16], + # output.shape [1, 3, 16, 16, 16], max_indices.shape [1, 3, 16, 16, 16] """ kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') if stride is None: @@ -1267,10 +1266,9 @@ def adaptive_avg_pool1d(x, output_size, name=None): Returns: Tensor: The output tensor of adaptive average pooling result. The data type is same as input tensor. - Raises: - ValueError: 'output_size' should be an integer. Examples: .. code-block:: python + :name: code-example1 # average adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], @@ -1286,10 +1284,9 @@ def adaptive_avg_pool1d(x, output_size, name=None): # import paddle import paddle.nn.functional as F - import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.adaptive_average_pool1d(data, output_size=16) + data = paddle.uniform([1, 3, 32]) + pool_out = F.adaptive_avg_pool1d(data, output_size=16) # pool_out shape: [1, 3, 16]) """ pool_type = 'avg' diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index cd82fe12fff6b..7fd109843bede 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -367,7 +367,7 @@ class PReLU(Layer): Parameters: num_parameters (int, optional): Number of `weight` to learn. The supported values are: 1 - a single parameter `alpha` is used for all input channels; - Number of channels - a seperate `alpha` is used for each input channel. + Number of channels - a separate `alpha` is used for each input channel. Default is 1. init (float, optional): Init value of learnable `weight`. Default is 0.25. weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. 
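The adaptive_avg_pool1d docstring updated above describes splitting the L dimension of an [N, C, L] input into output_size grids and averaging within each grid. The minimal NumPy sketch below illustrates that computation; it is not part of the patch, and the floor/ceil grid boundaries are an assumption about how the even split is realised.

import math
import numpy as np

def adaptive_avg_pool1d_ref(x, m):
    # x: array of shape [N, C, L]; m: requested output size along the last axis.
    n, c, l = x.shape
    out = np.zeros((n, c, m), dtype=x.dtype)
    for i in range(m):
        lstart = math.floor(i * l / m)       # left edge of the i-th grid
        lend = math.ceil((i + 1) * l / m)    # right edge of the i-th grid
        out[:, :, i] = x[:, :, lstart:lend].mean(axis=-1)
    return out

data = np.random.uniform(-1.0, 1.0, (1, 3, 32)).astype("float32")
print(adaptive_avg_pool1d_ref(data, 16).shape)  # (1, 3, 16)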
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index d4e059b6dfa49..a20e7de751d16 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -465,14 +465,18 @@ class HSigmoidLoss(Layer): import paddle paddle.set_device('cpu') - input = paddle.uniform([2, 3]) - # [[-0.2820413 0.9528898 -0.81638825] # random - # [-0.6733154 -0.33866507 0.25770962]] # random + input = paddle.uniform([4, 3]) + # [[0.56194401 -0.22450298 -0.10741806] # random + # [0.36136317 0.23556745 0.88748658] # random + # [0.18151939 0.80947340 -0.31078976] # random + # [0.68886101 -0.14239830 -0.41297770]] # random label = paddle.to_tensor([0, 1, 4, 5]) m = paddle.nn.HSigmoidLoss(3, 5) out = m(input, label) - # [[2.4543471] - # [1.9359267]] + # [[2.42524505] + # [1.74917245] + # [3.14571381] + # [2.34564662]] """ def __init__(self, diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index b0b6e62a602aa..340372f9b6a4e 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -163,9 +163,9 @@ def __init__(self, super(MultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected num_heads to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -508,12 +508,12 @@ def __init__(self, super(TransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout @@ -813,12 +813,12 @@ def __init__(self, super(TransformerDecoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout @@ -1220,12 +1220,12 @@ def __init__(self, super(Transformer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) if isinstance(bias_attr, (list, tuple)): if len(bias_attr) == 1: diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py index 
75266abdf0d13..56c9e83c38b06 100644 --- a/python/paddle/nn/utils/spectral_norm_hook.py +++ b/python/paddle/nn/utils/spectral_norm_hook.py @@ -178,7 +178,7 @@ def spectral_norm(layer, .. code-block:: python from paddle.nn import Conv2D - from paddle.nn.utils import Spectralnorm + from paddle.nn.utils import spectral_norm conv = Conv2D(3, 1, 3) sn_conv = spectral_norm(conv) diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index c131d218a1cde..84644ccc48445 100755 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -213,15 +213,21 @@ def remove_weight_norm(layer, name='weight'): Examples: .. code-block:: python - import paddle - from paddle.nn import Conv2D - from paddle.nn.utils import weight_norm, remove_weight_norm - - conv = Conv2D(3, 5, 3) - wn = weight_norm(conv) - remove_weight_norm(conv) - print(conv.weight_g) - # AttributeError: 'Conv2D' object has no attribute 'weight_g' + import paddle + from paddle.nn import Conv2D + from paddle.nn.utils import weight_norm, remove_weight_norm + + conv = Conv2D(3, 5, 3) + wn = weight_norm(conv) + print(conv.weight_g) + # Parameter containing: + # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [0., 0., 0., 0., 0.]) + # Conv2D(3, 5, kernel_size=[3, 3], data_format=NCHW) + + remove_weight_norm(conv) + # print(conv.weight_g) + # AttributeError: 'Conv2D' object has no attribute 'weight_g' """ for k, hook in layer._forward_pre_hooks.items(): if isinstance(hook, WeightNorm) and hook.name == name: diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index ea4349bc0b2c5..12b8272707bd8 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -33,7 +33,8 @@ 'LambdaDecay', 'ReduceOnPlateau', 'CosineAnnealingDecay', - 'MultiplicativeDecay' + 'MultiplicativeDecay', + 'OneCycleLR' ] @@ -1591,3 +1592,212 @@ def get_lr(self): for epoch in range(1, self.last_epoch + 1): cur_lr = cur_lr * self.lr_lambda(epoch) return cur_lr + + +class OneCycleLR(LRScheduler): + r""" + Sets the learning rate according to the one cycle learning rate scheduler. + The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then + from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate. + + It has been proposed in `Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates `_. + + Please note that the default behaviour of this scheduler follows the fastai implementation of one cycle, + which claims that “unpublished work has shown even better results by using only two phases”. + If you want the behaviour of this scheduler to be consistent with the paper, please set ``three_phase=True`` . + + Also note that you should update learning rate each step. + + Args: + max_learning_rate (float): The maximum learning rate. It is a python float number. + Functionally, it defines the initial learning rate by ``divide_factor`` . + total_steps (int): Number of total training steps. + divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. + end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate. + phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3. 
+ anneal_strategy (str, optional): Strategy of adjusting learning rate. 'cos' for cosine annealing, + 'linear' for linear annealing. Default: 'cos'. + three_phase (bool, optional): Whether to use three phases. + If ``True``: + 1. The learning rate will first increase from the initial learning rate to the maximum learning rate. + 2. Then it will decrease to the initial learning rate. The number of steps in this phase is the same as that of the first phase. + 3. Finally, it will decrease to the minimum learning rate, which is much less than the initial learning rate. + If ``False``: + 1. The learning rate will increase to the maximum learning rate. + 2. Then it will directly decrease to the minimum learning rate. + last_epoch (int, optional): The index of the last epoch. Can be set to restart training. Default: -1, which means the initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``OneCycleLR`` instance to schedule learning rate. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + # train on default dynamic graph mode + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(5): + for batch_id in range(20): + x = paddle.uniform([10, 10]) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_gradients() + scheduler.step() # You should update learning rate each step + + # train on static graph mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(5): + for batch_id in range(20): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() # You should update learning rate each step + """ + + def __init__(self, + max_learning_rate, + total_steps, + divide_factor=25., + end_learning_rate=0.0001, + phase_pct=0.3, + anneal_strategy='cos', + three_phase=False, + last_epoch=-1, + verbose=False): + # Check type and value of max_learning_rate + if not isinstance(max_learning_rate, (float, int)): + raise TypeError( + "'max_learning_rate' must be 'float' or 'int', but received {}". + format(type(max_learning_rate))) + if max_learning_rate < 0: + raise ValueError("'max_learning_rate' must be a positive value.") + + # Check type and value of end_learning_rate + if not isinstance(end_learning_rate, (float, int)): + raise TypeError( + "'end_learning_rate' must be 'float' or 'int', but received {}". + format(type(end_learning_rate))) + if end_learning_rate < 0: + raise ValueError("'end_learning_rate' must be a positive value.") + + # Check type and value of total_steps + if not isinstance(total_steps, int): + raise TypeError("'total_steps' must be 'int', but received {}".
+ format(type(total_steps))) + if total_steps <= 0: + raise ValueError("'total_steps' must be a positive integer.") + self.total_steps = total_steps + + # Check type and value of phase_pct + if not isinstance(phase_pct, float): + raise TypeError("'phase_pct' must be 'float', but received {}". + format(type(phase_pct))) + if phase_pct < 0 or phase_pct > 1: + raise ValueError( + "'phase_pct' must be between 0 and 1, but received {}".format( + phase_pct)) + + # Check type and value of divide_factor + if not isinstance(divide_factor, (float, int)): + raise TypeError( + "'divide_factor' must be 'float' or 'int', but received {}". + format(type(divide_factor))) + + initial_lr = max_learning_rate / float(divide_factor) + min_lr = float(end_learning_rate) + + if three_phase: + if phase_pct >= 0.5: + raise ValueError( + "When three_phase is True, 'phase_pct' must be less than 0.5" + ) + # start step and end step of each phase. + self._step_config = [ + 0, + phase_pct * self.total_steps - 1, + 2 * phase_pct * self.total_steps - 2, + self.total_steps - 1, + self.total_steps - 1, # for the last step. + ] + # step size of each phase. + self._steps_size = [ + self._step_config[1] - self._step_config[0], + self._step_config[2] - self._step_config[1], + self._step_config[3] - self._step_config[2], + self._step_config[3] - + self._step_config[2], # for the last step. + ] + # start lr and end lr of each phase. + self._lr_config = [ + initial_lr, max_learning_rate, initial_lr, min_lr + ] + else: + self._step_config = [ + 0, phase_pct * self.total_steps - 1, self.total_steps - 1, + self.total_steps - 1 + ] + self._steps_size = [ + self._step_config[1] - self._step_config[0], + self._step_config[2] - self._step_config[1], + self._step_config[2] - self._step_config[1], + ] + self._lr_config = [initial_lr, max_learning_rate, min_lr] + + # Check anneal_strategy + if anneal_strategy == 'cos': + self.anneal_func = self._cos_annealing + elif anneal_strategy == 'linear': + self.anneal_func = self._linear_annealing + else: + raise ValueError( + "'anneal_strategy' must be one of 'cos' or 'linear', but received {}". + format(anneal_strategy)) + super(OneCycleLR, self).__init__(initial_lr, last_epoch, verbose) + + def _cos_annealing(self, start_lr, end_lr, pct): + cos_out = math.cos(math.pi * pct) + 1 + return end_lr + (start_lr - end_lr) / 2.0 * cos_out + + def _linear_annealing(self, start_lr, end_lr, pct): + return (end_lr - start_lr) * pct + start_lr + + def get_lr(self): + current_step = self.last_epoch + + if current_step > self.total_steps: + raise ValueError( + "Tried to step {} times. However, the total number of steps is {}" + .format(current_step, self.total_steps)) + + for (i, (end_step, step_size) + ) in enumerate(zip(self._step_config[1:], self._steps_size)): + # i == len(self._lr_config) - 2 catches the last step; otherwise it would return None. + if current_step <= end_step or i == len(self._lr_config) - 2: + # self._step_config[i] means the start step of a phase.
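+                # Worked example (hypothetical numbers, not part of the original patch):
+                # with three_phase=False, total_steps=100 and phase_pct=0.3, the
+                # configuration above is roughly _step_config = [0, 29, 99, 99] and
+                # _steps_size = [29, 70, 70]. At current_step=50 the loop selects i=1,
+                # so percentage = (50 - 29) / 70 ~= 0.3 is passed to the cosine or
+                # linear annealing function chosen in __init__.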
+ percentage = (current_step - self._step_config[i]) / step_size + return self.anneal_func(self._lr_config[i], + self._lr_config[i + 1], percentage) diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 77adbaff34859..9df595bc3ae73 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -150,7 +150,7 @@ def getScheduleState(step: int) -> ProfilerState: def _default_state_scheduler(step: int): r""" - A default state scheduler, keep recording from the begining of the profiler until ending. + A default state scheduler, keep recording from the beginning of the profiler until ending. """ return ProfilerState.RECORD diff --git a/python/paddle/profiler/timer.py b/python/paddle/profiler/timer.py index 1fb06ddc55e39..815775ebc6aad 100644 --- a/python/paddle/profiler/timer.py +++ b/python/paddle/profiler/timer.py @@ -193,7 +193,7 @@ def begin(self, benchmark): def before_reader(self, benchmark): """ Initialize the start time of the dataloader. This function will be - called at the begining of `next` method in `_DataLoaderIterMultiProcess` or + called at the beginning of `next` method in `_DataLoaderIterMultiProcess` or `_DataLoaderIterSingleProcess`. """ @@ -220,8 +220,8 @@ def after_step(self, benchmark): Record the cost for the current step. It will contain the cost of the loading data if there is a dataloader. Similar to `after_reader`, it will also update the maximum, minimum and the total time from the step 11 to the current step - as well as the the maximum and minimum speed of the model. This function will - be called in in `Profiler.step()`. + as well as the maximum and minimum speed of the model. This function will + be called in `Profiler.step()`. """ @@ -401,7 +401,7 @@ def check_if_need_record(self, reader): # enter a new task but not calling beign() to record it. # we pause the timer until the end of new task, so that # the cost of new task is not added to the current event. - # eg. start evaluation in the traing task + # eg. start evaluation in the training task self.current_event.need_record = False else: # when the new task exits, continue timing for the current event. diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index fd75ab9550d52..5e95c83129f53 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -78,7 +78,7 @@ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any): def begin(self): r""" - Record the time of begining. + Record the time of beginning. 
+        Record the time of beginning.
Examples: diff --git a/python/paddle/sparse/__init__.py b/python/paddle/sparse/__init__.py index 93653e09c9019..26a2f0cfadbe7 100644 --- a/python/paddle/sparse/__init__.py +++ b/python/paddle/sparse/__init__.py @@ -14,15 +14,19 @@ from .creation import sparse_coo_tensor from .creation import sparse_csr_tensor -from .layer.activation import ReLU -from .layer.norm import BatchNorm +from .layer import ReLU +from .layer import BatchNorm -from .layer.conv import Conv3D -from .layer.conv import SubmConv3D +from .layer import Conv3D +from .layer import SubmConv3D -from .layer.pooling import MaxPool3D +from .layer import MaxPool3D + +from .functional import sqrt +from .functional import sin +from .functional import tanh __all__ = [ 'sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'Conv3D', 'SubmConv3D', - 'BatchNorm', 'MaxPool3D' + 'BatchNorm', 'MaxPool3D', 'sqrt', 'sin', 'tanh' ] diff --git a/python/paddle/sparse/functional/__init__.py b/python/paddle/sparse/functional/__init__.py index f1ca4cc6fcc48..cfefa3ff4ff76 100644 --- a/python/paddle/sparse/functional/__init__.py +++ b/python/paddle/sparse/functional/__init__.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .activation import relu # noqa: F401 +from .unary import relu # noqa: F401 +from .unary import tanh # noqa: F401 +from .unary import sqrt # noqa: F401 +from .unary import sin # noqa: F401 from .conv import conv3d # noqa: F401 from .conv import subm_conv3d # noqa: F401 from .pooling import max_pool3d # noqa: F401 -__all__ = ['relu', 'conv3d', 'subm_conv3d', 'max_pool3d'] +__all__ = ['relu', 'tanh', 'conv3d', 'subm_conv3d', 'max_pool3d', 'sqrt', 'sin'] diff --git a/python/paddle/sparse/functional/activation.py b/python/paddle/sparse/functional/activation.py deleted file mode 100644 index c0109bc4e2429..0000000000000 --- a/python/paddle/sparse/functional/activation.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__all__ = [] - -from paddle import _C_ops, in_dynamic_mode - - -def relu(x, name=None): - """ - sparse relu activation. - - .. math:: - - out = max(x, 0) - - Parameters: - x (Tensor): The input Sparse Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. - - Returns: - A Sparse Tensor with the same data type and shape as ``x`` . - - Examples: - .. 
code-block:: python - - import paddle - import numpy as np - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - dense_x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.sparse.functional.relu(sparse_x) - """ - - assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_coo( - ), "Currently, sparse.relu only support the input of SparseCooTensor" - - return _C_ops.final_state_sparse_relu(x) diff --git a/python/paddle/sparse/functional/unary.py b/python/paddle/sparse/functional/unary.py new file mode 100644 index 0000000000000..860b4025d89e0 --- /dev/null +++ b/python/paddle/sparse/functional/unary.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [] + +from paddle import _C_ops, in_dynamic_mode + + +def relu(x, name=None): + """ + sparse relu activation, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = max(x, 0) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.functional.relu(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo(): + return _C_ops.final_state_sparse_coo_relu(x) + elif x.is_sparse_csr(): + return _C_ops.final_state_sparse_csr_relu(x) + else: + raise ValueError( + "Currently, sparse.relu only support the input of SparseCooTensor or SparseCsrTensor" + ) + + +def tanh(x, name=None): + """ + sparse tanh activation, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = tanh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.tanh(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo(): + return _C_ops.final_state_sparse_coo_tanh(x) + elif x.is_sparse_csr(): + return _C_ops.final_state_sparse_csr_tanh(x) + else: + raise ValueError( + "Currently, sparse.tanh only support the input of SparseCooTensor or SparseCsrTensor" + ) + + +def sqrt(x, name=None): + """ + Calculate square root of x, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = sqrt(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([4, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.sqrt(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo(): + return _C_ops.final_state_sparse_coo_sqrt(x) + elif x.is_sparse_csr(): + return _C_ops.final_state_sparse_csr_sqrt(x) + else: + raise ValueError( + "Currently, sparse.sqrt only support the input of SparseCooTensor or SparseCsrTensor" + ) + + +def sin(x, name=None): + """ + Calculate sin of x, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = sin(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.sin(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo(): + return _C_ops.final_state_sparse_coo_sin(x) + elif x.is_sparse_csr(): + return _C_ops.final_state_sparse_csr_sin(x) + else: + raise ValueError( + "Currently, sparse.sin only support the input of SparseCooTensor or SparseCsrTensor" + ) diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/sparse/layer/__init__.py index 3a6d99392e4e8..8a814b514276f 100644 --- a/python/paddle/sparse/layer/__init__.py +++ b/python/paddle/sparse/layer/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
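A combined usage sketch (not part of this patch; it assumes ``Tensor.to_sparse_csr()`` is available in this development build) exercising both the coo and csr branches of the unary ops added above:

    .. code-block:: python

        import paddle
        from paddle.fluid.framework import _test_eager_guard

        with _test_eager_guard():
            dense = paddle.to_tensor([[0., 4.], [1., 0.]])
            coo_out = paddle.sparse.sqrt(dense.to_sparse_coo(2))
            csr_out = paddle.sparse.tanh(dense.to_sparse_csr())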
-from .activation import ReLU +from .unary import ReLU from .norm import BatchNorm from .conv import Conv3D from .conv import SubmConv3D diff --git a/python/paddle/sparse/layer/activation.py b/python/paddle/sparse/layer/unary.py similarity index 100% rename from python/paddle/sparse/layer/activation.py rename to python/paddle/sparse/layer/unary.py diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 0268382b88c73..283bce1cc817f 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -229,6 +229,7 @@ from .math import fmin # noqa: F401 from .math import inner # noqa: F401 from .math import outer # noqa: F401 +from .math import heaviside # noqa: F401 from .math import frac # noqa: F401 from .random import multinomial # noqa: F401 @@ -495,6 +496,7 @@ 'put_along_axis', 'put_along_axis_', 'exponential_', + 'heaviside', ] #this list used in math_op_patch.py for magic_method bind diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index a5a4df6571b77..5163e6e5395bd 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -92,13 +92,13 @@ def linspace(start, stop, num, dtype=None, name=None): dtype = convert_np_dtype_to_dtype_(dtype) if not isinstance(start, Variable): with device_guard("cpu"): - tensor_start = fill_constant([1], dtype, start) + tensor_start = fill_constant([1], dtype, start, force_cpu=True) if not isinstance(stop, Variable): with device_guard("cpu"): - tensor_stop = fill_constant([1], dtype, stop) + tensor_stop = fill_constant([1], dtype, stop, force_cpu=True) if not isinstance(num, Variable): with device_guard("cpu"): - tensor_num = fill_constant([1], 'int32', num) + tensor_num = fill_constant([1], 'int32', num, force_cpu=True) if _non_static_mode(): return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', dtype) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 127aa71137dff..22e3b7cd29e0e 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1489,7 +1489,8 @@ def roll(x, shifts, axis=None, name=None): x (Tensor): The x tensor as input. shifts (int|list|tuple): The number of places by which the elements of the `x` tensor are shifted. - axis (int|list|tuple|None): axis(axes) along which to roll. + axis (int|list|tuple, optional): axis(axes) along which to roll. Default: None + name (str, optional): Name for the operation. Default: None Returns: Tensor: A Tensor with same data type as `x`. @@ -1512,6 +1513,11 @@ def roll(x, shifts, axis=None, name=None): #[[7. 8. 9.] # [1. 2. 3.] # [4. 5. 6.]] + out_z3 = paddle.roll(x, shifts=1, axis=1) + print(out_z3) + #[[3. 1. 2.] + # [6. 4. 5.] + # [9. 7. 
8.]] """ origin_shape = x.shape if type(shifts) == int: @@ -1530,8 +1536,6 @@ def roll(x, shifts, axis=None, name=None): axis = [] if in_dygraph_mode(): - if isinstance(shifts, paddle.Tensor): - shifts = shifts.cpu() return _C_ops.final_state_roll(x, shifts, axis) if _in_legacy_dygraph(): diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 83501b0399492..9e2384a8d9cc8 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1388,10 +1388,10 @@ def add_n(inputs, name=None): if len(inputs) > 0: for input in inputs: check_variable_and_dtype(input, "inputs", \ - ['float32', 'float64', 'int32', 'int64'], 'add_n') + ['float16', 'float32', 'float64', 'int32', 'int64'], 'add_n') else: check_variable_and_dtype(inputs, "inputs", \ - ['float32', 'float64', 'int32', 'int64'], 'add_n') + ['float16', 'float32', 'float64', 'int32', 'int64'], 'add_n') out = helper.create_variable_for_type_inference( @@ -3346,7 +3346,7 @@ def increment(x, value=1.0, name=None): def all(x, axis=None, keepdim=False, name=None): """ - Computes the the ``logical and`` of tensor elements over the given dimension. + Computes the ``logical and`` of tensor elements over the given dimension. Args: x (Tensor): An N-D Tensor, the input data type should be `bool`. @@ -3442,7 +3442,7 @@ def all(x, axis=None, keepdim=False, name=None): def any(x, axis=None, keepdim=False, name=None): """ - Computes the the ``logical or`` of tensor elements over the given dimension. + Computes the ``logical or`` of tensor elements over the given dimension. Args: x (Tensor): An N-D Tensor, the input data type should be `bool`. @@ -3810,7 +3810,7 @@ def lerp(x, y, weight, name=None): x = paddle.arange(1., 5., dtype='float32') y = paddle.empty([4], dtype='float32') y.fill_(10.) - out = paddle.lerp(start, end, 0.5) + out = paddle.lerp(x, y, 0.5) # out: [5.5., 6., 6.5, 7.] """ @@ -4381,6 +4381,54 @@ def angle(x, name=None): helper.append_op(type=op_type, inputs=inputs, outputs=outputs) return out +def heaviside(x, y, name=None): + """ + Computes the Heaviside step function determined by corresponding element in y for each element in x. The equation is + + .. math:: + heaviside(x, y)= + \left\{ + \\begin{array}{lcl} + 0,& &\\text{if} \ x < 0, \\\\ + y,& &\\text{if} \ x = 0, \\\\ + 1,& &\\text{if} \ x > 0. + \end{array} + \\right. + + Notes: + ``paddle.heaviside`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + + Args: + x (Tensor): The input tensor of Heaviside step function, it's data type should be float32, float64, int32 or int64. + y (Tensor): The tensor that determines a Heaviside step function, it's data type should be float32, float64, int32 or int64. + name (str, optional): Name for the operation (optional, default is None). Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + N-D Tensor. A location into which the result is stored. If x and y have different shapes and are broadcastable, the resulting tensor shape is the shape of x and y after broadcasting. If x, y have the same shape, its shape is the same as x and y. + + Examples: + .. code-block:: python + :name: heaviside-example + + import paddle + x = paddle.to_tensor([-0.5, 0, 0.5]) + y = paddle.to_tensor([0.1]) + paddle.heaviside(x, y) + # [0. , 0.10000000, 1. ] + x = paddle.to_tensor([[-0.5, 0, 0.5], [-0.5, 0.5, 0]]) + y = paddle.to_tensor([0.1, 0.2, 0.3]) + paddle.heaviside(x, y) + # [[0. 
, 0.20000000, 1. ], + # [0. , 1. , 0.30000001]] + """ + op_type = 'elementwise_heaviside' + axis = -1 + act = None + if _non_static_mode(): + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name=op_type) + return _elementwise_op(LayerHelper(op_type, **locals())) + def frac(x, name=None): """ This API is used to return the fractional portion of each element in input. diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 71c97d4cac986..42d3bf9fca364 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -42,7 +42,7 @@ def set_printoptions(precision=None, Args: precision (int, optional): Number of digits of the floating number, default 8. threshold (int, optional): Total number of elements printed, default 1000. - edgeitems (int, optional): Number of elements in summary at the begining and ending of each dimension, default 3. + edgeitems (int, optional): Number of elements in summary at the beginning and ending of each dimension, default 3. sci_mode (bool, optional): Format the floating number with scientific notation or not, default False. linewidth (int, optional): Number of characters each line, default 80. diff --git a/python/paddle/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py index eab34a6dafbc3..de0518e229b0a 100644 --- a/python/paddle/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py @@ -58,7 +58,7 @@ def compute_accuracy(pred, gt): @unittest.skipIf(not fluid.is_compiled_with_cuda(), 'CPU testing is not supported') class TestDistTraning(unittest.TestCase): - def test_static_multiple_gpus(self): + def test_dynamic_multiple_gpus(self): device = set_device('gpu') im_shape = (-1, 1, 28, 28) diff --git a/python/paddle/tests/test_dist_hapi_model.py b/python/paddle/tests/test_dist_hapi_model.py index 16788e4656192..006800d3caeee 100644 --- a/python/paddle/tests/test_dist_hapi_model.py +++ b/python/paddle/tests/test_dist_hapi_model.py @@ -52,6 +52,7 @@ def get_gpus(selected_gpus): def start_local_trainers(cluster, pod, training_script, + eager_mode, training_script_args, log_dir=None): current_env = copy.copy(os.environ.copy()) @@ -72,6 +73,9 @@ def start_local_trainers(cluster, "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } + if not eager_mode: + proc_env["FLAGS_enable_eager_mode"] = "%d" % 0 + current_env.update(proc_env) print("trainer proc env:{}".format(current_env)) @@ -99,7 +103,7 @@ def start_local_trainers(cluster, class TestMultipleGpus(unittest.TestCase): - def run_mnist_2gpu(self, target_file_name): + def run_mnist_2gpu(self, target_file_name, eager_mode=True): if fluid.core.get_cuda_device_count() == 0: return @@ -112,6 +116,7 @@ def run_mnist_2gpu(self, target_file_name): procs = start_local_trainers( cluster, pod, + eager_mode=eager_mode, training_script=target_file_name, training_script_args=[]) @@ -125,13 +130,17 @@ def run_mnist_2gpu(self, target_file_name): def test_hapi_multiple_gpus_static(self): self.run_mnist_2gpu('dist_hapi_mnist_static.py') + self.run_mnist_2gpu('dist_hapi_mnist_static.py', eager_mode=False) def test_hapi_multiple_gpus_dynamic(self): self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py') + self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py', eager_mode=False) def test_hapi_amp_static(self): self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py') + self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] 
= "1" unittest.main() diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index 242680bc7c738..e07ac47a0f818 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -123,6 +123,44 @@ def test_color_jitter(self): ]) self.do_transform(trans) + def test_affine(self): + trans = transforms.Compose([ + transforms.RandomAffine(90), + transforms.RandomAffine( + [-10, 10], translate=[0.1, 0.3]), + transforms.RandomAffine( + 45, translate=[0.2, 0.2], scale=[0.2, 0.5]), + transforms.RandomAffine( + 10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[-10, 10]), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40]), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + interpolation='bilinear'), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + interpolation='bilinear', + fill=114), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + interpolation='bilinear', + fill=114, + center=(60, 80)), + ]) + self.do_transform(trans) + def test_rotate(self): trans = transforms.Compose([ transforms.RandomRotation(90), @@ -134,6 +172,14 @@ def test_rotate(self): ]) self.do_transform(trans) + def test_perspective(self): + trans = transforms.Compose([ + transforms.RandomPerspective(prob=1.0), + transforms.RandomPerspective( + prob=1.0, distortion_scale=0.9), + ]) + self.do_transform(trans) + def test_pad(self): trans = transforms.Compose([transforms.Pad(2)]) self.do_transform(trans) @@ -278,6 +324,35 @@ def test_exception(self): tensor_img = paddle.rand((3, 100, 100)) F.pad(tensor_img, [1.0, 2.0, 3.0]) + with self.assertRaises(ValueError): + transforms.RandomAffine(-10) + + with self.assertRaises(ValueError): + transforms.RandomAffine([-30, 60], translate=[2, 2]) + + with self.assertRaises(ValueError): + transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 0, 20, 40]) + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + fill=114, + center=(1, 2, 3)) + with self.assertRaises(ValueError): transforms.RandomRotation(-2) @@ -383,6 +458,20 @@ def test_color_jitter(self): trans = transforms.Compose([transforms.ColorJitter(1.1, 2.2, 0.8, 0.1)]) self.do_transform(trans) + color_jitter_trans = transforms.ColorJitter(1.2, 0.2, 0.5, 0.2) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = color_jitter_trans(batch_input) + + def test_perspective(self): + trans = transforms.RandomPerspective(prob=1.0, distortion_scale=0.7) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = trans(batch_input) + + def test_affine(self): + trans = transforms.RandomAffine(15, translate=[0.1, 0.1]) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = trans(batch_input) + def test_pad(self): trans = transforms.Compose([transforms.Pad(2)]) self.do_transform(trans) @@ -433,6 +522,10 @@ def test_erase(self): ]) self.do_transform(trans) + erase_trans = transforms.RandomErasing(value=(0.5, 0.2, 0.01)) + batch_input = paddle.rand((2, 
3, 4, 4), dtype=paddle.float32) + result = erase_trans(batch_input) + def test_exception(self): trans = transforms.Compose([transforms.Resize(-1)]) @@ -479,6 +572,29 @@ def test_exception(self): tensor_img = paddle.rand((3, 100, 100)) F.pad(tensor_img, [1.0, 2.0, 3.0]) + with self.assertRaises(ValueError): + transforms.RandomAffine(-10) + + with self.assertRaises(ValueError): + transforms.RandomAffine([-30, 60], translate=[2, 2]) + + with self.assertRaises(ValueError): + transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[-2, -1]), + + with self.assertRaises(ValueError): + transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 0, 20, 40]) + with self.assertRaises(ValueError): transforms.RandomRotation(-2) @@ -547,6 +663,36 @@ def test_errors(self): with self.assertRaises(TypeError): F.adjust_saturation(1, 0.1) + with self.assertRaises(TypeError): + F.affine('45') + + with self.assertRaises(TypeError): + F.affine(45, translate=0.3) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2, 0.3]) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2], scale=-0.5) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2], scale=0.5, shear=10) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 0, 10]) + + with self.assertRaises(TypeError): + F.affine( + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10], + interpolation=2) + + with self.assertRaises(TypeError): + F.affine( + 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10], center=0) + with self.assertRaises(TypeError): F.rotate(1, 0.1) @@ -785,6 +931,31 @@ def test_image_load(self): os.remove(path) + def test_affine(self): + np_img = (np.random.rand(32, 26, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img).convert('RGB') + tensor_img = F.to_tensor(pil_img, data_format='CHW') * 255 + + np.testing.assert_almost_equal( + np_img, tensor_img.transpose((1, 2, 0)), decimal=4) + + np_affined_img = F.affine( + np_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + pil_affined_img = F.affine( + pil_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + tensor_affined_img = F.affine( + tensor_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + + np.testing.assert_equal(np_affined_img.shape, + np.array(pil_affined_img).shape) + np.testing.assert_equal(np_affined_img.shape, + tensor_affined_img.transpose((1, 2, 0)).shape) + + np.testing.assert_almost_equal( + np.array(pil_affined_img), + tensor_affined_img.numpy().transpose((1, 2, 0)), + decimal=4) + def test_rotate(self): np_img = (np.random.rand(28, 28, 3) * 255).astype('uint8') pil_img = Image.fromarray(np_img).convert('RGB') @@ -819,6 +990,144 @@ def test_rotate1(self): np.testing.assert_equal(rotated_np_img.shape, np.array(rotated_pil_img).shape) + def test_perspective(self): + np_img = (np.random.rand(32, 26, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img).convert('RGB') + tensor_img = F.to_tensor(pil_img, data_format='CHW') * 255 + + np.testing.assert_almost_equal( + np_img, tensor_img.transpose((1, 2, 0)), decimal=4) + + startpoints = [[0, 0], [13, 0], [13, 15], [0, 15]] + endpoints = [[3, 2], [12, 3], [10, 14], [2, 15]] + + np_perspectived_img = 
F.perspective(np_img, startpoints, endpoints) + pil_perspectived_img = F.perspective(pil_img, startpoints, endpoints) + tensor_perspectived_img = F.perspective(tensor_img, startpoints, + endpoints) + + np.testing.assert_equal(np_perspectived_img.shape, + np.array(pil_perspectived_img).shape) + np.testing.assert_equal(np_perspectived_img.shape, + tensor_perspectived_img.transpose( + (1, 2, 0)).shape) + + result_pil = np.array(pil_perspectived_img) + result_tensor = tensor_perspectived_img.numpy().transpose( + (1, 2, 0)).astype('uint8') + num_diff_pixels = (result_pil != result_tensor).sum() / 3.0 + ratio_diff_pixels = num_diff_pixels / result_tensor.shape[ + 0] / result_tensor.shape[1] + # Tolerance : less than 6% of different pixels + assert ratio_diff_pixels < 0.06 + + def test_batch_input(self): + paddle.seed(777) + batch_tensor = paddle.rand((2, 3, 8, 8), dtype=paddle.float32) + + def test_erase(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.erase(input1, 1, 1, 2, 2, 0.5), + F.erase(input2, 1, 1, 2, 2, 0.5) + ]) + + batch_result = F.erase(batch_tensor, 1, 1, 2, 2, 0.5) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_erase(batch_tensor)) + + def test_affine(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.affine( + input1, + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10]), F.affine( + input2, + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10]) + ]) + batch_result = F.affine( + batch_tensor, + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10]) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_affine(batch_tensor)) + + def test_perspective(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + startpoints = [[0, 0], [3, 0], [4, 5], [6, 7]] + endpoints = [[0, 1], [3, 1], [4, 4], [5, 7]] + target_result = paddle.stack([ + F.perspective(input1, startpoints, endpoints), + F.perspective(input2, startpoints, endpoints) + ]) + + batch_result = F.perspective(batch_tensor, startpoints, endpoints) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_perspective(batch_tensor)) + + def test_adjust_brightness(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.adjust_brightness(input1, 2.1), + F.adjust_brightness(input2, 2.1) + ]) + + batch_result = F.adjust_brightness(batch_tensor, 2.1) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_brightness(batch_tensor)) + + def test_adjust_contrast(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.adjust_contrast(input1, 0.3), F.adjust_contrast(input2, 0.3) + ]) + + batch_result = F.adjust_contrast(batch_tensor, 0.3) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_contrast(batch_tensor)) + + def test_adjust_saturation(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.adjust_saturation(input1, 1.1), + F.adjust_saturation(input2, 1.1) + ]) + + batch_result = F.adjust_saturation(batch_tensor, 1.1) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_saturation(batch_tensor)) + + def test_adjust_hue(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack( + [F.adjust_hue(input1, -0.2), 
F.adjust_hue(input2, -0.2)]) + + batch_result = F.adjust_hue(batch_tensor, -0.2) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_hue(batch_tensor)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index f89c65a492913..fade8b7495d43 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -714,6 +714,7 @@ backend : x inplace : (x -> out) view : (x -> out) + # intermediate : xshape backward : flatten_grad # flip diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 717870ee01d0a..8483325221eb4 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -32,11 +32,7 @@ def __init__(self, api_item_yaml): # names : [], list of output names # types : [], list of output types # out_size_expr : [], expression for getting size of vector - # return_type : Tensor, vector, ..., the return type of api - # args_str: - # args_declare : "str" // str of function params with default value. Example: (..., bool flag=false) - # args_define : "str" // str of function params without default value. Example: (..., bool flag) - self.inputs, self.attrs, self.outputs, self.args_str, self.optional_vars = self.parse_args( + self.inputs, self.attrs, self.outputs, self.optional_vars = self.parse_args( self.api, api_item_yaml) self.is_base_api = True @@ -60,22 +56,54 @@ def get_api_name(self, api_item_yaml): def get_api_func_name(self): return self.api + def get_input_tensor_args(self, inplace_flag=False): + input_args = [] + inplace_type_map = { + "const Tensor&": "Tensor&", + "const std::vector&": "std::vector&" + } + for name in self.inputs['names']: + name = name.split('@')[0] + if inplace_flag and name in self.inplace_map.values(): + input_args.append(inplace_type_map[self.inputs['input_info'][ + name]] + ' ' + name) + else: + input_args.append(self.inputs['input_info'][name] + ' ' + name) + return input_args + + def get_declare_args(self, inplace_flag=False): + declare_args = self.get_input_tensor_args(inplace_flag) + for name in self.attrs['names']: + default_value = '' + if self.attrs['attr_info'][name][1] is not None: + default_value = ' = ' + self.attrs['attr_info'][name][1] + declare_args.append(self.attrs['attr_info'][name][0] + ' ' + name + + default_value) + + return ", ".join(declare_args) + + def get_define_args(self, inplace_flag=False): + define_args = self.get_input_tensor_args(inplace_flag) + for name in self.attrs['names']: + define_args.append(self.attrs['attr_info'][name][0] + ' ' + name) + + return ", ".join(define_args) + def parse_args(self, api_name, api_item_yaml): optional_vars = [] if 'optional' in api_item_yaml: optional_vars = [ item.strip() for item in api_item_yaml['optional'].split(',') ] - inputs, attrs, args_str = self.parse_input_and_attr( + inputs, attrs = self.parse_input_and_attr( api_name, api_item_yaml['args'], optional_vars) - output_type_list, output_names, out_size_expr, return_type = self.parse_output( + output_type_list, output_names, out_size_expr = self.parse_output( api_name, api_item_yaml['output']) return inputs, attrs, { 'names': output_names, 'types': output_type_list, - 'out_size_expr': out_size_expr, - 'return_type': return_type - }, args_str, optional_vars + 'out_size_expr': out_size_expr + }, optional_vars def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): inputs = {'names': [], 
'input_info': {}} @@ -125,9 +153,6 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'DataType': 'paddle::optional' } - args_declare_str = "" - args_define_str = "" - for item in args_list: item = item.strip() type_and_name = item.split(' ') @@ -146,8 +171,6 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): inputs['names'].append(input_name) inputs['input_info'][input_name] = in_type - args_declare_str = args_declare_str + in_type + ' ' + input_name + ', ' - args_define_str = args_define_str + in_type + ' ' + input_name + ', ' has_input = True break if has_input: @@ -169,16 +192,11 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): attr_type = optional_types_trans[attr_type_symbol] default_value_str = "" if default_value is None else '=' + default_value - args_declare_str = args_declare_str + attr_type + ' ' + attr_name + default_value_str + ', ' - args_define_str = args_define_str + attr_type + ' ' + attr_name + ', ' attrs['names'].append(attr_name) attrs['attr_info'][attr_name] = (attr_type, default_value) break - return inputs, attrs, { - 'args_declare': args_declare_str[:-2], - 'args_define': args_define_str[:-2] - } + return inputs, attrs def parse_output(self, api_name, output_config): def parse_output_item(output_item): @@ -205,8 +223,7 @@ def parse_output_item(output_item): if len(temp_list) == 1: out_type, out_name, size_expr = parse_output_item(temp_list[0]) - return [out_type], [out_name], size_expr, self.get_return_type( - [out_type]) + return [out_type], [out_name], size_expr else: out_type_list = [] out_name_list = [] @@ -215,8 +232,7 @@ def parse_output_item(output_item): out_type_list.append(out_type) out_name_list.append(out_name) - return out_type_list, out_name_list, size_expr, self.get_return_type( - out_type_list) + return out_type_list, out_name_list, size_expr def parse_infer_meta(self, infer_meta_config): infer_meta = infer_meta_config @@ -279,7 +295,7 @@ def parse_data_transform(self, api_item_yaml): return data_transform def parse_inplace_and_view(self, api_item_yaml): - inplace_map, view_map = None, None + inplace_map, view_map = {}, {} for mode in ['inplace', 'view']: if mode in api_item_yaml: if mode == 'inplace': @@ -304,17 +320,22 @@ def parse_inplace_and_view(self, api_item_yaml): return inplace_map, view_map # Override by child class - def get_return_type(self, out_type_list): + def get_return_type(self, inplace_flag=False): return None def gene_api_declaration(self): - api_declaration = f""" -PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name()}({self.args_str['args_declare']}); + api_declaration = "" + api_func_name = self.get_api_func_name() + if api_func_name[-1] != '_': + api_declaration = f""" +PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args()}); """ - if self.is_base_api and self.inplace_map is not None: + if self.is_base_api and len(self.inplace_map) > 0: + if api_func_name[-1] != '_': + api_func_name += '_' api_declaration = api_declaration + f""" -PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self.args_str['args_declare']}); +PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); """ return api_declaration @@ -513,7 +534,7 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: {code_indent} auto {out_name}_{PREFIX_META_TENSOR_NAME}vec = MakeMetaTensor({out_name}); {code_indent} std::vector 
{out_name}_metas({out_name}_{PREFIX_META_TENSOR_NAME}vec.size()); {code_indent} for (size_t i = 0; i < {out_name}_{PREFIX_META_TENSOR_NAME}vec.size(); ++i) {{ -{code_indent} {out_name}_metas[i] = &{out_name}_{PREFIX_META_TENSOR_NAME}vec[i]; +{code_indent} {out_name}_metas[i] = {out_name}[i] ? &{out_name}_{PREFIX_META_TENSOR_NAME}vec[i] : nullptr; {code_indent} }}""" param_code = param_code + out_name + '_metas, ' @@ -521,8 +542,10 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + out_name.replace( 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + out_name + ");\n" - param_code = param_code + "&" + out_name.replace( - 'kernel_', PREFIX_META_TENSOR_NAME) + ", " + if len(kernel_output_names) == 1: + param_code = param_code + f"&{out_name.replace('kernel_', PREFIX_META_TENSOR_NAME)}, " + else: + param_code = param_code + f"{out_name} ? &{out_name.replace('kernel_', PREFIX_META_TENSOR_NAME)} : nullptr, " param_code = param_code[:-2] return f"""{meta_tensor_code} @@ -706,13 +729,9 @@ def get_selected_rows_kernel_args(self, code_indent): return input_tensor_code, kernel_args[:-2], kernel_signature - # Override by child class - def gene_return_type_code(self): - return self.outputs['return_type'] - # Override by child class def gene_return_code(self): - return "api_output" + return "return api_output;" # Override by child class def gene_output(self, @@ -748,7 +767,7 @@ def gen_dense_tensor_kernel_code(self, code_indent, inplace_flag=False): {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} -{code_indent} return {self.gene_return_code()};""" +{code_indent} {self.gene_return_code()}""" def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False): input_tensors, kernel_args, kernel_signature = self.get_selected_rows_kernel_args( @@ -775,12 +794,14 @@ def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False): {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} -{code_indent} return {self.gene_return_code()};""" +{code_indent} {self.gene_return_code()}""" def gene_base_api_code(self, inplace_flag=False): - api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') + api_func_name = self.get_api_func_name() + if inplace_flag and api_func_name[-1] != '_': + api_func_name += '_' api_code = f""" -PADDLE_API {self.gene_return_type_code()} {api_func_name}({self.args_str["args_define"]}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ {self.gene_kernel_select()} """ @@ -802,10 +823,16 @@ def gene_base_api_code(self, inplace_flag=False): } """ + def gene_invoke_code(self, invoke_code, params_code): + return f""" +PADDLE_API {self.get_return_type()} {self.api}({params_code}) {{ + return {invoke_code}; +}}""" + def gene_api_code(self): if self.is_base_api: api_code = self.gene_base_api_code() - if self.inplace_map is not None: + if len(self.inplace_map) > 0: api_code = api_code + self.gene_base_api_code(inplace_flag=True) return api_code @@ -821,12 +848,8 @@ def adjust_name(matched): invoke_code = re.sub(pattern, adjust_name, self.invoke) params_code = re.sub(pattern, adjust_name, - self.args_str["args_define"]) + self.get_define_args()) else: invoke_code = self.invoke - params_code = self.args_str["args_define"] - return f""" -{self.outputs['return_type']} {self.api}({params_code}) {{ - return {invoke_code}; -}} -""" + params_code = self.get_define_args() + return 
self.gene_invoke_code(invoke_code, params_code) diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index fa5666b493b38..0de60c14d3a42 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -19,6 +19,11 @@ from api_base import BaseAPI, PREFIX_TENSOR_NAME +inplace_out_type_map = { + "Tensor": "Tensor&", + "std::vector": "std::vector&" +} + class ForwardAPI(BaseAPI): def __init__(self, api_item_yaml): @@ -42,38 +47,49 @@ def parse_intermediate(self, api_item_yaml): else: return False, [] - def get_return_type(self, out_type_list): - return out_type_list[0] if len( - out_type_list) == 1 else "std::tuple<" + ",".join( - out_type_list) + ">" + def get_return_type_with_intermediate(self, inplace_flag=False): + out_type_list = [] + for i, out_type in enumerate(self.outputs['types']): + out_name = self.outputs['names'][i].split('@')[0] + if inplace_flag and out_name in self.inplace_map: + out_type_list.append(inplace_out_type_map[out_type]) + else: + out_type_list.append(out_type) - def gene_return_type_code(self): - if self.is_dygraph_api or len(self.intermediate_outs) == 0: - return self.outputs['return_type'] + if len(out_type_list) == 1: + return out_type_list[0] else: - return_out_list = [] - for i, name in enumerate(self.outputs['names']): - if name not in self.intermediate_outs: - return_out_list.append(self.outputs['types'][i]) - return return_out_list[0] if len( - return_out_list) == 1 else "std::tuple<" + ",".join( - return_out_list) + ">" + return "std::tuple<" + ", ".join(out_type_list) + ">" + + def get_return_type(self, inplace_flag=False): + out_type_list = [] + for i, out_type in enumerate(self.outputs['types']): + out_name = self.outputs['names'][i].split('@')[0] + if inplace_flag and out_name in self.inplace_map: + out_type_list.append(inplace_out_type_map[out_type]) + elif self.is_dygraph_api or out_name not in self.intermediate_outs: + out_type_list.append(out_type) + + if len(out_type_list) == 1: + return out_type_list[0] + else: + return "std::tuple<" + ", ".join(out_type_list) + ">" def gene_return_code(self): if self.is_dygraph_api or len(self.intermediate_outs) == 0: - return "api_output" + return "return api_output;" else: return_out_list = [] for i, name in enumerate(self.outputs['names']): - if name not in self.intermediate_outs: + if name.split('@')[0] not in self.intermediate_outs: return_out_list.append(i) if len(return_out_list) == 1: - return f"std::get<{return_out_list[0]}>(api_output)" + return f"return std::get<{return_out_list[0]}>(api_output);" else: selected_code = [ f"std::get<{i}>(api_output)" for i in return_out_list ] - return '{' + ", ".join(selected_code) + '}' + return 'return {' + ", ".join(selected_code) + '};' def gene_output(self, output_type_list, @@ -83,17 +99,18 @@ def gene_output(self, kernel_output = "" output_names = [] output_create = "" + return_type = self.get_return_type_with_intermediate(inplace_flag) if len(output_type_list) == 1: kernel_output = 'kernel_out' output_names.append('kernel_out') inplace_assign = " = " + self.inplace_map[self.outputs['names'][ - 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][0] in self.inplace_map else "" + 0]] if inplace_flag and self.outputs['names'][ + 0] in self.inplace_map else "" output_create = f""" -{code_indent} {self.outputs['return_type']} api_output{inplace_assign};""" +{code_indent} {return_type} api_output{inplace_assign};""" - if self.outputs['return_type'] == 
'std::vector': + if return_type == 'std::vector': assert self.outputs['out_size_expr'] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." output_create = output_create + f""" @@ -112,15 +129,23 @@ def gene_output(self, elif len(output_type_list) > 1: output_create = f""" -{code_indent} {self.outputs['return_type']} api_output;""" +{code_indent} {return_type} api_output;""" + + if inplace_flag: + output_create = f""" +{code_indent} {return_type} api_output{{""" + + for out_name in self.outputs['names']: + if out_name in self.inplace_map: + output_create = output_create + self.inplace_map[ + out_name] + ', ' + else: + output_create += 'Tensor(), ' + output_create = output_create[:-2] + '};' for i in range(len(output_type_list)): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" -{code_indent} std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};""" if output_type_list[i] == 'std::vector': assert self.outputs['out_size_expr'][i] is not None, \ diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index ba9563a1b1f54..3e19a915c1993 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -71,7 +71,7 @@ forward : add_n (Tensor[] x) -> Tensor(out) args : (Tensor[] x, Tensor out_grad) output : Tensor[](x_grad){x.size()} - invoke : add_n_grad_impl(x, out_grad) + invoke : add_n_grad_impl(x, out_grad, x_grad) no_need_buffer : x - backward_api : add_triple_grad @@ -251,6 +251,16 @@ kernel : func : cholesky_solve_grad +- backward_api : clip_double_grad + forward : clip_grad (Tensor x, Tensor grad_out, Scalar min = 0., Scalar max = 0.) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_x_grad, Scalar min = 0., Scalar max = 0.) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : clip_grad + - backward_api : clip_grad forward : clip (Tensor x, Scalar min, Scalar max) -> Tensor(out) args : (Tensor x, Tensor out_grad, Scalar min = 0., Scalar max = 0.) 
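The ``*_double_grad`` entries added in this file register second-order kernels for their ops. A rough sketch (not part of this patch) of the kind of Python call that would exercise ``clip_double_grad``, differentiating the first-order gradient again via ``paddle.grad`` with ``create_graph=True``:

    .. code-block:: python

        import paddle

        x = paddle.to_tensor([0.5, -1.5, 2.0], stop_gradient=False)
        y = paddle.clip(x, min=-1.0, max=1.0)
        # keep the first-order grad in the graph so it can be differentiated again
        dx, = paddle.grad(y, x, create_graph=True)
        (dx * dx).sum().backward()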
@@ -260,6 +270,17 @@ param : [x] kernel : func : clip_grad + backward : clip_double_grad + +- backward_api : concat_double_grad + forward : concat_grad (Tensor[] x, Tensor grad_out, Scalar axis) -> Tensor[](grad_x) + args : (Tensor[] grad_x_grad, Scalar axis = 0) + output : Tensor(grad_out_grad) + infer_meta : + func : ConcatInferMeta + param : [grad_x_grad, axis] + kernel : + func : concat - backward_api : concat_grad forward : concat (Tensor[] x, Scalar axis) -> Tensor(out) @@ -271,6 +292,7 @@ kernel : func : concat_grad no_need_buffer : x + backward : concat_double_grad - backward_api : conj_grad forward : conj (Tensor x) -> Tensor(out) @@ -286,7 +308,7 @@ forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad) - invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) backward : conv2d_grad_grad - backward_api : conv2d_grad_grad @@ -301,6 +323,16 @@ use_gpudnn : true optional : grad_input_grad, grad_filter_grad +- backward_api : conv2d_transpose_double_grad + forward : conv2d_transpose_grad(Tensor x, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(grad_x), Tensor(grad_filter) + args : (Tensor x, Tensor filter, Tensor grad_out, Tensor grad_x_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) + output : Tensor(x_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : Conv2dTransposeDoubleGradInferMeta + kernel : + func : conv2d_transpose_grad_grad + use_gpudnn : true + - backward_api : conv2d_transpose_grad forward : conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) @@ -310,6 +342,7 @@ kernel : func : conv2d_transpose_grad use_gpudnn : true + backward : conv2d_transpose_double_grad - backward_api : conv3d_transpose_grad forward : conv3d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) @@ -570,6 +603,15 @@ func : expand_as_grad no_need_buffer : x +- backward_api : expand_double_grad + forward : expand_grad (Tensor x, Tensor grad_out, IntArray shape) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray shape) + output : Tensor(grad_out_grad) + infer_meta : + func : ExpandInferMeta + kernel : + func : expand + - 
backward_api : expand_grad forward : expand (Tensor x, IntArray shape) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray shape) @@ -580,6 +622,7 @@ kernel : func : expand_grad no_need_buffer : x + backward : expand_double_grad - backward_api : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) @@ -753,7 +796,7 @@ forward : imag (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) - invoke : imag_grad_impl(out_grad) + invoke : imag_grad_impl(out_grad, x_grad) - backward_api : index_sample_grad forward : index_sample (Tensor x, Tensor index) -> Tensor(out) @@ -1381,7 +1424,7 @@ forward : real (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) - invoke : real_grad_impl(out_grad) + invoke : real_grad_impl(out_grad, x_grad) - backward_api : reciprocal_grad forward : reciprocal (Tensor x) -> Tensor(out) @@ -1782,7 +1825,7 @@ forward : sum_double_grad (Tensor grad_grad_x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(grad_grad_out) args : (Tensor grad_grad_x, Tensor grad_grad_out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) output : Tensor(grad_grad_x_grad) - invoke : sum_grad(grad_grad_x, grad_grad_out_grad, dims, keep_dim, reduce_all) + invoke : sum_grad(grad_grad_x, grad_grad_out_grad, dims, keep_dim, reduce_all, grad_grad_x_grad) - backward_api : swish_grad forward : swish (Tensor x, float beta=1.0) -> Tensor(out) @@ -1866,6 +1909,15 @@ kernel : func : thresholded_relu_grad +- backward_api : tile_double_grad + forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray repeat_times) + output : Tensor(grad_out_grad) + infer_meta : + func : TileInferMeta + kernel : + func : tile + - backward_api : tile_grad forward : tile (Tensor x, IntArray repeat_times) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray repeat_times) @@ -1876,6 +1928,7 @@ kernel : func : tile_grad no_need_buffer : x + backward : tile_double_grad - backward_api : top_k_grad forward : top_k (Tensor x, Scalar k, int axis = -1, bool largest = true, bool sorted = true) -> Tensor(out), Tensor(indices) diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index e7ef1423598d6..76e1eecdc0020 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -35,10 +35,10 @@ def parse_forward_config(self, forward_config): r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", forward_config) api = result.group('api') - _, outputs, _, _ = self.parse_output(self.api, result.group('outputs')) + _, outputs, _, = self.parse_output(self.api, result.group('outputs')) outputs = [item.split('@')[0] for item in outputs] - fw_inputs, fw_attrs, _, = self.parse_input_and_attr( - api, result.group('args')) + fw_inputs, fw_attrs = self.parse_input_and_attr(api, + result.group('args')) return api, fw_inputs, fw_attrs, outputs @@ -77,6 +77,25 @@ def check_args(self, forward_config): f"{self.api} : Output error: The number of outputs should be less then the number of inputs of forward api. \ Please check the output of {self.api} in yaml." 
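
With the backward_api_gen.py changes that follow, generated backward C++ APIs return void and receive their outputs as trailing out-parameters instead of returning a tuple or vector; get_define_args appends one pointer-style parameter per output to the forward-style argument list. A simplified, self-contained Python sketch of that mapping (the names and the exact spelling of the list-output type are assumptions for illustration, not the generator's actual code):

    # Sketch of the new backward-API signature convention: inputs/attrs stay as-is,
    # each output becomes an out-parameter, and the return type is void.
    OUT_PARAM_TYPE = {
        'Tensor': 'Tensor*',
        'Tensor[]': 'std::vector<Tensor*>',  # assumed spelling for list outputs
    }

    def build_define_args(inputs_and_attrs, out_names, out_types):
        outs = ['{} {}'.format(OUT_PARAM_TYPE[t], n.split('@')[0])
                for n, t in zip(out_names, out_types)]
        return ', '.join([inputs_and_attrs] + outs)

    print(build_define_args('const Tensor& x, const Tensor& out_grad',
                            ['x_grad@DenseTensor'], ['Tensor']))
    # -> const Tensor& x, const Tensor& out_grad, Tensor* x_grad
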
+ def get_declare_args(self, inplace_flag=False): + return self.get_define_args() + + def get_define_args(self, inplace_flag=False): + out_type_map = { + 'Tensor': 'Tensor*', + 'std::vector': 'std::vector' + } + intputs_and_attrs = super(BackwardAPI, self).get_define_args() + outs = [] + for i, name in enumerate(self.outputs['names']): + outs.append(out_type_map[self.outputs['types'][i]] + ' ' + + name.split('@')[0]) + result = intputs_and_attrs + ', ' + ", ".join(outs) + return result + + def gene_return_code(self): + return "" + def gene_kernel_backend_select(self): all_no_need_buffer = True for in_name in self.inputs['names']: @@ -90,9 +109,8 @@ def gene_kernel_backend_select(self): else: return super().gene_kernel_backend_select() - def get_return_type(self, out_type_list): - return out_type_list[0] if len( - out_type_list) == 1 else "std::vector>" + def get_return_type(self, inplace_flag=False): + return 'void' def gene_output(self, output_type_list, @@ -109,23 +127,19 @@ def gene_output(self, inplace_assign = " = " + self.inplace_map[self.outputs['names'][ 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" - output_create = f""" -{code_indent} {self.outputs['return_type']} api_output{inplace_assign};""" - + output_create = "" if output_type_list[0] == 'std::vector': assert self.outputs['out_size_expr'] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." output_create = output_create + f""" -{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr']}, kernel_backend, &api_output);""" +{code_indent} auto kernel_out = {set_out_func}(&{self.outputs['names'][0]});""" else: output_create = output_create + f""" -{code_indent} auto kernel_out = {set_out_func}(kernel_backend, &api_output);""" +{code_indent} auto kernel_out = {set_out_func}(kernel_backend, {self.outputs['names'][0]});""" elif len(output_type_list) > 1: - output_create = f""" -{code_indent} {self.outputs['return_type']} api_output({len(output_type_list)});""" - + output_create = "" for i, out_type_item in enumerate(output_type_list): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') @@ -133,26 +147,21 @@ def gene_output(self, if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" -{code_indent} api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" - - else: - output_create = output_create + f""" -{code_indent} api_output[{i}].emplace_back();""" +{code_indent} *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + f""" -{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, &api_output[{i}][0]);""" +{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, {self.outputs['names'][i]});""" else: - get_out_code = f'&api_output[{i}]' if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" -{code_indent} api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" +{code_indent} *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" assert self.outputs['out_size_expr'][i] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
output_create = output_create + f""" -{code_indent} auto kernel_out_{i} = {set_out_func}({self.outputs['out_size_expr'][i]}, kernel_backend, &api_output[{i}]);""" +{code_indent} auto kernel_out_{i} = {set_out_func}(&{self.outputs['names'][i]});""" kernel_output = kernel_output[:-2] else: @@ -162,6 +171,21 @@ def gene_output(self, return kernel_output, output_names, output_create + def gene_invoke_code(self, invoke_code, params_code): + inveke_func_name = invoke_code.split('(')[0].strip() + if inveke_func_name.endswith('_grad') or inveke_func_name.endswith( + '_grad_impl'): + return f""" +PADDLE_API {self.get_return_type()} {self.api}({params_code}) {{ + {invoke_code}; +}}""" + + else: + return f""" +PADDLE_API {self.get_return_type()} {self.api}({params_code}) {{ + *{self.outputs['names'][0].split('@')[0]} = {invoke_code}; +}}""" + def header_include(): return """ diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index ca4330f2af362..ae3e9e6942233 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -7,6 +7,38 @@ intermediate : rulebook backward : conv3d_grad +- api : coo_relu + args : (Tensor x) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_relu + layout : x + backward : sparse_coo_relu_grad + +- api : coo_sin + args : (Tensor x) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_sin + layout : x + backward : sparse_coo_sin_grad + +- api : coo_sqrt + args : (Tensor x) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_sqrt + layout : x + backward : sparse_coo_sqrt_grad + +- api : coo_tanh + args : (Tensor x) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_tanh + layout : x + backward : sparse_coo_tanh_grad + - api : coo_to_dense args : (Tensor x) output : Tensor(out@DenseTensor) @@ -30,6 +62,34 @@ data_type : values backward : create_sparse_coo_tensor_grad +- api : csr_relu + args : (Tensor x) + output : Tensor(out@SparseCsrTensor) + kernel : + func : sparse_csr_relu + layout : x + +- api : csr_sin + args : (Tensor x) + output : Tensor(out@SparseCsrTensor) + kernel : + func : sparse_csr_sin + layout : x + +- api : csr_sqrt + args : (Tensor x) + output : Tensor(out@SparseCsrTensor) + kernel : + func : sparse_csr_sqrt + layout : x + +- api : csr_tanh + args : (Tensor x) + output : Tensor(out@SparseCsrTensor) + kernel : + func : sparse_csr_tanh + layout : x + - api : csr_values args : (Tensor x) output : Tensor(out@DenseTensor) @@ -43,14 +103,6 @@ invoke : to_sparse_coo_impl(x, sparse_dim) backward : dense_to_coo_grad -- api : relu - args : (Tensor x) - output : Tensor(out@SparseCooTensor) - kernel : - func : sparse_relu - layout : x - backward : sparse_relu_grad - - api : to_dense args : (Tensor x) output : Tensor(out@DenseTensor) diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index c0316fc164294..509858d339f69 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -27,7 +27,7 @@ def __init__(self, api_item_yaml): def gene_api_declaration(self): return f""" // {", ".join(self.outputs['names'])} -PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']}); +{super(SparseAPI, self).gene_api_declaration()} """ def get_kernel_tensor_out_type(self, output_name): @@ -46,6 +46,7 @@ def gene_output(self, kernel_output = "" output_names = [] 
output_create = "" + return_type = self.get_return_type_with_intermediate(inplace_flag) if len(output_type_list) == 1: kernel_output = 'kernel_out' @@ -54,21 +55,29 @@ def gene_output(self, 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" - {self.outputs['return_type']} api_output{inplace_assign}; + {return_type} api_output{inplace_assign}; auto* kernel_out = {set_out_func}(&api_output, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" elif len(output_type_list) > 1: output_create = f""" - {self.outputs['return_type']} api_output;""" + {return_type} api_output;""" + + if inplace_flag: + output_create = f""" + {return_type} api_output{{""" + + for out_name in self.outputs['names']: + out_name = out_name.split('@')[0] + if out_name in self.inplace_map: + output_create = output_create + self.inplace_map[ + out_name] + ', ' + else: + output_create += 'Tensor(), ' + output_create = output_create[:-2] + '};' for i in range(len(output_type_list)): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" - std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};""" - output_create = output_create + f""" auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(api_output), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" @@ -136,7 +145,8 @@ def gen_sparse_kernel_code(self, inplace_flag=False): kernel_context_code = self.gen_sparse_kernel_context( kernel_output_names) - + return_code = "" if len(self.gene_return_code( + )) == 0 else " " + self.gene_return_code() return f""" auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); @@ -148,13 +158,14 @@ def gen_sparse_kernel_code(self, inplace_flag=False): {output_create} {kernel_context_code} phi_kernel(&kernel_context); - - return api_output;""" +{return_code}""" def gene_base_api_code(self, inplace_flag=False): api_func_name = self.get_api_func_name() + if inplace_flag and api_func_name[-1] != '_': + api_func_name += '_' return f""" -PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{ +PADDLE_API {self.get_return_type()} {api_func_name}({self.get_define_args()}) {{ {self.gene_kernel_select()} {self.gen_sparse_kernel_code(inplace_flag)} }} diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 74299ed3e39a0..d8e8aad8f98b2 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -32,16 +32,37 @@ output : Tensor(x_grad@DenseTensor) invoke : to_dense_impl(out_grad) -- backward_api : sparse_maxpool_grad - forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) - args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) +- backward_api : sparse_coo_relu_grad + forward : sparse_coo_relu(Tensor x) -> Tensor(out@SparseCooTensor) + args : (Tensor out, Tensor out_grad) output : Tensor(x_grad@SparseCooTensor) kernel : - func : sparse_maxpool_grad + func : sparse_coo_relu_grad -- backward_api : sparse_relu_grad - forward : sparse_relu(Tensor x) -> 
Tensor(out@SparseCooTensor) +- backward_api : sparse_coo_sin_grad + forward : sparse_coo_sin(Tensor x) -> Tensor(out@SparseCooTensor) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad@SparseCooTensor) kernel : - func : sparse_relu_grad + func : sparse_coo_sin_grad + +- backward_api : sparse_coo_sqrt_grad + forward : sparse_coo_sqrt(Tensor x) -> Tensor(out@SparseCooTensor) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad@SparseCooTensor) + kernel : + func : sparse_coo_sqrt_grad + +- backward_api : sparse_coo_tanh_grad + forward : sparse_coo_tanh(Tensor x) -> Tensor(out@SparseCooTensor) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad@SparseCooTensor) + kernel : + func : sparse_coo_tanh_grad + +- backward_api : sparse_maxpool_grad + forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) + output : Tensor(x_grad@SparseCooTensor) + kernel : + func : sparse_maxpool_grad diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py index 4f209a7592161..53a99d798118e 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -31,12 +31,21 @@ def get_api_func_name(self): def gene_kernel_backend_select(self): return BackwardAPI.gene_kernel_backend_select(self) - def get_return_type(self, out_type_list): - return BackwardAPI.get_return_type(self, out_type_list) + def get_return_type(self, inplace_flag=False): + return BackwardAPI.get_return_type(self) + + def gene_return_code(self): + return "" def gene_api_declaration(self): return SparseAPI.gene_api_declaration(self) + def get_declare_args(self, inplace_flag=False): + return BackwardAPI.get_declare_args(self) + + def get_define_args(self, inplace_flag=False): + return BackwardAPI.get_define_args(self) + def gene_output(self, output_type_list, set_out_func, @@ -53,36 +62,21 @@ def gene_output(self, 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" - {self.outputs['return_type']} api_output{inplace_assign}; - auto kernel_out = {set_out_func}(&api_output, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + auto kernel_out = {set_out_func}({self.outputs['names'][0].split('@')[0]}, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" elif len(output_type_list) > 1: - output_create = f""" - {self.outputs['return_type']} api_output({len(output_type_list)});""" + output_create = "" for i, out_type_item in enumerate(output_type_list): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') - if out_type_item == 'Tensor': - get_out_code = f'&api_output[{i}][0]' - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" - api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" - - else: - output_create = output_create + f""" - api_output[{i}].emplace_back();""" - - else: - get_out_code = f'&api_output[{i}]' - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" - api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" + if inplace_flag and self.inplace_map is not None and 
self.outputs[ + 'names'][i] in self.inplace_map: + output_create = output_create + f""" + *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + f""" - auto kernel_out_{i} = {set_out_func}({get_out_code}, {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" + auto kernel_out_{i} = {set_out_func}({self.outputs['names'][i].split('@')[0]}, {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" kernel_output = kernel_output[:-2] else: diff --git a/python/paddle/utils/code_gen/strings_api_gen.py b/python/paddle/utils/code_gen/strings_api_gen.py index 061ea6c3ceef9..d697ce3935708 100644 --- a/python/paddle/utils/code_gen/strings_api_gen.py +++ b/python/paddle/utils/code_gen/strings_api_gen.py @@ -32,7 +32,7 @@ def get_api_func_name(self): def gene_api_declaration(self): return f""" // {", ".join(self.outputs['names'])} -PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']}); +{super(StringsAPI, self).gene_api_declaration()} """ def get_kernel_tensor_out_type(self, output_name): @@ -56,6 +56,7 @@ def gene_output(self, kernel_output = "" output_names = [] output_create = "" + return_type = self.get_return_type(inplace_flag) if len(output_type_list) == 1: kernel_output = 'kernel_out' @@ -67,13 +68,12 @@ def gene_output(self, 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" - {self.outputs['return_type']} api_output{inplace_assign}; - + {return_type} api_output{inplace_assign}; {tensor_type}* kernel_out = dynamic_cast<{tensor_type}*>({set_out_func}(kernel_backend, &api_output, {kernel_tensor_out_type}));""" elif len(output_type_list) > 1: output_create = f""" - {self.outputs['return_type']} api_output;""" + {return_type} api_output;""" for i in range(len(output_type_list)): kernel_output = kernel_output + f'kernel_out_{i}, ' @@ -194,7 +194,7 @@ def gen_string_tensor_kernel_code(self, inplace_flag=False, code_indent=""): {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); -{code_indent} return {self.gene_return_code()};""" +{code_indent} {self.gene_return_code()}""" def gene_kernel_select(self) -> str: api = self.api @@ -264,7 +264,7 @@ def gene_kernel_select(self) -> str: def gene_base_api_code(self, inplace_flag=False): api_func_name = self.get_api_func_name() return f""" -PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ {self.gene_kernel_select()} {self.gen_string_tensor_kernel_code(inplace_flag)} }} diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py index da7ae010c58f6..70aa1b833d648 100644 --- a/python/paddle/vision/models/mobilenetv3.py +++ b/python/paddle/vision/models/mobilenetv3.py @@ -39,7 +39,7 @@ class SqueezeExcitation(nn.Layer): """ This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1). - Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in in eq. 3. + Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3. This code is based on the torchvision code with modifications. 
You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L127 Args: diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index e4dd4c797fef6..d45c652885b69 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -895,7 +895,10 @@ def decode_jpeg(x, mode='unchanged', name=None): Examples: .. code-block:: python + + # required: gpu import cv2 + import numpy as np import paddle fake_img = (np.random.random( diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py index b255e663e6876..5992a4f977411 100644 --- a/python/paddle/vision/transforms/__init__.py +++ b/python/paddle/vision/transforms/__init__.py @@ -28,7 +28,9 @@ from .transforms import ColorJitter # noqa: F401 from .transforms import RandomCrop # noqa: F401 from .transforms import Pad # noqa: F401 +from .transforms import RandomAffine # noqa: F401 from .transforms import RandomRotation # noqa: F401 +from .transforms import RandomPerspective # noqa: F401 from .transforms import Grayscale # noqa: F401 from .transforms import ToTensor # noqa: F401 from .transforms import RandomErasing # noqa: F401 @@ -37,7 +39,9 @@ from .functional import vflip # noqa: F401 from .functional import resize # noqa: F401 from .functional import pad # noqa: F401 +from .functional import affine # noqa: F401 from .functional import rotate # noqa: F401 +from .functional import perspective # noqa: F401 from .functional import to_grayscale # noqa: F401 from .functional import crop # noqa: F401 from .functional import center_crop # noqa: F401 @@ -64,7 +68,9 @@ 'ColorJitter', 'RandomCrop', 'Pad', + 'RandomAffine', 'RandomRotation', + 'RandomPerspective', 'Grayscale', 'ToTensor', 'RandomErasing', @@ -73,7 +79,9 @@ 'vflip', 'resize', 'pad', + 'affine', 'rotate', + 'perspective', 'to_grayscale', 'crop', 'center_crop', diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 5a8c2cc09f884..90fba1c4130e5 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -537,6 +537,166 @@ def adjust_hue(img, hue_factor): return F_t.adjust_hue(img, hue_factor) +def _get_affine_matrix(center, angle, translate, scale, shear): + # Affine matrix is : M = T * C * RotateScaleShear * C^-1 + # Ihe inverse one is : M^-1 = C * RotateScaleShear^-1 * C^-1 * T^-1 + rot = math.radians(angle) + sx = math.radians(shear[0]) + sy = math.radians(shear[1]) + + # Rotate and Shear without scaling + a = math.cos(rot - sy) / math.cos(sy) + b = -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot) + c = math.sin(rot - sy) / math.cos(sy) + d = -math.sin(rot - sy) * math.tan(sx) / math.cos(sy) + math.cos(rot) + + # Center Translation + cx, cy = center + tx, ty = translate + + # Inverted rotation matrix with scale and shear + # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1 + matrix = [d, -b, 0.0, -c, a, 0.0] + matrix = [x / scale for x in matrix] + # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1 + matrix[2] += matrix[0] * (-cx - tx) + matrix[1] * (-cy - ty) + matrix[5] += matrix[3] * (-cx - tx) + matrix[4] * (-cy - ty) + # Apply center translation: C * RSS^-1 * C^-1 * T^-1 + matrix[2] += cx + matrix[5] += cy + + return matrix + + +def affine(img, + angle, + translate, + scale, + shear, + interpolation="nearest", + fill=0, + center=None): + """Apply affine transformation on the image. 
+ + Args: + img (PIL.Image|np.array|paddle.Tensor): Image to be affined. + angle (int|float): The angle of the random rotation in clockwise order. + translate (list[float]): Maximum absolute fraction for horizontal and vertical translations. + scale (float): Scale factor for the image, scale should be positive. + shear (list[float]): Shear angle values which are parallel to the x-axis and y-axis in clockwise order. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + center (2-tuple, optional): Optional center of rotation, (x, y). + Origin is the upper left corner. + Default is the center of the image. + + Returns: + PIL.Image|np.array|paddle.Tensor: Affine Transformed image. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.transforms import functional as F + + fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32) + + affined_img = F.affine(fake_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + print(affined_img.shape) + """ + + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): + raise TypeError( + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if not isinstance(angle, (int, float)): + raise TypeError("Argument angle should be int or float") + + if not isinstance(translate, (list, tuple)): + raise TypeError("Argument translate should be a sequence") + + if len(translate) != 2: + raise ValueError("Argument translate should be a sequence of length 2") + + if scale <= 0.0: + raise ValueError("Argument scale should be positive") + + if not isinstance(shear, (numbers.Number, (list, tuple))): + raise TypeError( + "Shear should be either a single value or a sequence of two values") + + if not isinstance(interpolation, str): + raise TypeError("Argument interpolation should be a string") + + if isinstance(angle, int): + angle = float(angle) + + if isinstance(translate, tuple): + translate = list(translate) + + if isinstance(shear, numbers.Number): + shear = [shear, 0.0] + + if isinstance(shear, tuple): + shear = list(shear) + + if len(shear) == 1: + shear = [shear[0], shear[0]] + + if len(shear) != 2: + raise ValueError( + f"Shear should be a sequence containing two values. 
Got {shear}") + + if center is not None and not isinstance(center, (list, tuple)): + raise TypeError("Argument center should be a sequence") + + if _is_pil_image(img): + width, height = img.size + # center = (width * 0.5 + 0.5, height * 0.5 + 0.5) + # it is visually better to estimate the center without 0.5 offset + # otherwise image rotated by 90 degrees is shifted vs output image of F_t.affine + if center is None: + center = [width * 0.5, height * 0.5] + matrix = _get_affine_matrix(center, angle, translate, scale, shear) + return F_pil.affine(img, matrix, interpolation, fill) + + if _is_numpy_image(img): + # get affine_matrix in F_cv2.affine() using cv2's functions + width, height = img.shape[0:2] + # center = (width * 0.5 + 0.5, height * 0.5 + 0.5) + # it is visually better to estimate the center without 0.5 offset + # otherwise image rotated by 90 degrees is shifted vs output image of F_t.affine + if center is None: + center = (width * 0.5, height * 0.5) + return F_cv2.affine(img, angle, translate, scale, shear, interpolation, + fill, center) + + if _is_tensor_image(img): + center_f = [0.0, 0.0] + if center is not None: + height, width = img.shape[-1], img.shape[-2] + # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. + center_f = [ + 1.0 * (c - s * 0.5) for c, s in zip(center, [width, height]) + ] + translate_f = [1.0 * t for t in translate] + matrix = _get_affine_matrix(center_f, angle, translate_f, scale, shear) + return F_t.affine(img, matrix, interpolation, fill) + + def rotate(img, angle, interpolation="nearest", @@ -607,6 +767,95 @@ def rotate(img, return F_cv2.rotate(img, angle, interpolation, expand, center, fill) +def _get_perspective_coeffs(startpoints, endpoints): + """ + get coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. + + In Perspective Transform each pixel (x, y) in the original image gets transformed as, + (x, y) -> ( (ax + by + c) / (gx + hy + 1), (dx + ey + f) / (gx + hy + 1) ) + + Args: + startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, + endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. + + Returns: + output (list): octuple (a, b, c, d, e, f, g, h) for transforming each pixel. + """ + a_matrix = np.zeros((2 * len(startpoints), 8)) + + for i, (p1, p2) in enumerate(zip(endpoints, startpoints)): + a_matrix[2 * i, :] = [ + p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1] + ] + a_matrix[2 * i + 1, :] = [ + 0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1] + ] + + b_matrix = np.array(startpoints).reshape([8]) + res = np.linalg.lstsq(a_matrix, b_matrix)[0] + + output = list(res) + return output + + +def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0): + """Perform perspective transform of the given image. + + Args: + img (PIL.Image|np.array|paddle.Tensor): Image to be transformed. + startpoints (list of list of ints): List containing four lists of two integers corresponding to four corners + ``[top-left, top-right, bottom-right, bottom-left]`` of the original image. + endpoints (list of list of ints): List containing four lists of two integers corresponding to four corners + ``[top-left, top-right, bottom-right, bottom-left]`` of the transformed image. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. 
+ When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + + Returns: + PIL.Image|np.array|paddle.Tensor: transformed Image. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.transforms import functional as F + + fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32) + + startpoints = [[0, 0], [33, 0], [33, 25], [0, 25]] + endpoints = [[3, 2], [32, 3], [30, 24], [2, 25]] + + perspectived_img = F.perspective(fake_img, startpoints, endpoints) + print(perspectived_img.shape) + + """ + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): + raise TypeError( + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + coeffs = _get_perspective_coeffs(startpoints, endpoints) + return F_pil.perspective(img, coeffs, interpolation, fill) + elif _is_tensor_image(img): + coeffs = _get_perspective_coeffs(startpoints, endpoints) + return F_t.perspective(img, coeffs, interpolation, fill) + else: + return F_cv2.perspective(img, startpoints, endpoints, interpolation, + fill) + + def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. @@ -714,9 +963,33 @@ def erase(img, i, j, h, w, v, inplace=False): import paddle - fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) + fake_img = paddle.randn((3, 2, 4)).astype(paddle.float32) + print(fake_img) + + #Tensor(shape=[3, 2, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[ 0.02169025, -0.97859967, -1.39175487, -1.07478464], + # [ 0.20654772, 1.74624777, 0.32268861, -0.13857445]], + # + # [[-0.14993843, 1.10793507, -0.40056887, -1.94395220], + # [ 0.41686651, 0.44551995, -0.09356714, -0.60898107]], + # + # [[-0.24998808, -1.47699273, -0.88838995, 0.42629015], + # [ 0.56948012, -0.96200180, 0.53355658, 3.20450878]]]) + values = paddle.zeros((1,1,1), dtype=paddle.float32) - result = paddle.vision.transforms.erase(fake_img, 4, 4, 3, 3, values) + result = paddle.vision.transforms.erase(fake_img, 0, 1, 1, 2, values) + + print(result) + + #Tensor(shape=[3, 2, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[ 0.02169025, 0. , 0. , -1.07478464], + # [ 0.20654772, 1.74624777, 0.32268861, -0.13857445]], + # + # [[-0.14993843, 0. , 0. , -1.94395220], + # [ 0.41686651, 0.44551995, -0.09356714, -0.60898107]], + # + # [[-0.24998808, 0. , 0. , 0.42629015], + # [ 0.56948012, -0.96200180, 0.53355658, 3.20450878]]]) """ if _is_tensor_image(img): diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index 8343a8c340ffb..1b2485541c499 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py @@ -411,6 +411,86 @@ def adjust_hue(img, hue_factor): return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) +def affine(img, + angle, + translate, + scale, + shear, + interpolation='nearest', + fill=0, + center=None): + """Affine the image by matrix. + + Args: + img (PIL.Image): Image to be affined. 
+ translate (sequence or int): horizontal and vertical translations + scale (float): overall scale ratio + shear (sequence or float): shear angle value in degrees between -180 to 180, clockwise direction. + If a sequence is specified, the first value corresponds to a shear parallel to the x axis, while + the second value corresponds to a shear parallel to the y axis. + interpolation (int|str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to cv2.INTER_NEAREST. + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the affined image. + If int, it is used for all channels respectively. + center (sequence, optional): Optional center of rotation. Origin is the upper left corner. + Default is the center of the image. + + Returns: + np.array: Affined image. + + """ + cv2 = try_import('cv2') + _cv2_interp_from_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4 + } + + h, w = img.shape[0:2] + + if isinstance(fill, int): + fill = tuple([fill] * 3) + + if center is None: + center = (w / 2.0, h / 2.0) + + M = np.ones([2, 3]) + # Rotate and Scale + R = cv2.getRotationMatrix2D(angle=angle, center=center, scale=scale) + + # Shear + sx = math.tan(shear[0] * math.pi / 180) + sy = math.tan(shear[1] * math.pi / 180) + M[0] = R[0] + sy * R[1] + M[1] = R[1] + sx * R[0] + + # Translation + tx, ty = translate + M[0, 2] = tx + M[1, 2] = ty + + if len(img.shape) == 3 and img.shape[2] == 1: + return cv2.warpAffine( + img, + M, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill)[:, :, np.newaxis] + else: + return cv2.warpAffine( + img, + M, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill) + + def rotate(img, angle, interpolation='nearest', @@ -509,6 +589,56 @@ def transform(x, y, matrix): borderValue=fill) +def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0): + """Perspective the image. + + Args: + img (np.array): Image to be perspectived. + startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, + endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. + interpolation (int|str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to cv2.INTER_NEAREST. + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + + Returns: + np.array: Perspectived image. 
+ + """ + cv2 = try_import('cv2') + _cv2_interp_from_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4 + } + h, w = img.shape[0:2] + + startpoints = np.array(startpoints, dtype="float32") + endpoints = np.array(endpoints, dtype="float32") + matrix = cv2.getPerspectiveTransform(startpoints, endpoints) + + if len(img.shape) == 3 and img.shape[2] == 1: + return cv2.warpPerspective( + img, + matrix, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill)[:, :, np.newaxis] + else: + return cv2.warpPerspective( + img, + matrix, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill) + + def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index 71f7759f11b66..4b86e14039ebe 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -410,6 +410,32 @@ def adjust_hue(img, hue_factor): return img +def affine(img, matrix, interpolation="nearest", fill=0): + """Affine the image by matrix. + + Args: + img (PIL.Image): Image to be affined. + matrix (float or int): Affine matrix. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the affined image. + If int, it is used for all channels respectively. + + Returns: + PIL.Image: Affined image. + + """ + if isinstance(fill, int): + fill = tuple([fill] * 3) + + return img.transform(img.size, Image.AFFINE, matrix, + _pil_interp_from_str[interpolation], fill) + + def rotate(img, angle, interpolation="nearest", @@ -453,6 +479,33 @@ def rotate(img, fillcolor=fill) +def perspective(img, coeffs, interpolation="nearest", fill=0): + """Perspective the image. + + Args: + img (PIL.Image): Image to be perspectived. + coeffs (list[float]): coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + + Returns: + PIL.Image: Perspectived image. + + """ + + if isinstance(fill, int): + fill = tuple([fill] * 3) + + return img.transform(img.size, Image.PERSPECTIVE, coeffs, + _pil_interp_from_str[interpolation], fill) + + def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. 
diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index 2e276883cd376..27f83029babaa 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -28,8 +28,9 @@ def _assert_image_tensor(img, data_format): if not isinstance( - img, paddle.Tensor) or img.ndim != 3 or not data_format.lower() in ( - 'chw', 'hwc'): + img, paddle.Tensor + ) or img.ndim < 3 or img.ndim > 4 or not data_format.lower() in ('chw', + 'hwc'): raise RuntimeError( 'not support [type={}, ndim={}, data_format={}] paddle image'. format(type(img), img.ndim, data_format)) @@ -226,8 +227,8 @@ def _affine_grid(theta, w, h, ow, oh): def _grid_transform(img, grid, mode, fill): if img.shape[0] > 1: - grid = grid.expand(img.shape[0], grid.shape[1], grid.shape[2], - grid.shape[3]) + grid = grid.expand( + shape=[img.shape[0], grid.shape[1], grid.shape[2], grid.shape[3]]) if fill is not None: dummy = paddle.ones( @@ -255,6 +256,51 @@ def _grid_transform(img, grid, mode, fill): return img +def affine(img, matrix, interpolation="nearest", fill=None, data_format='CHW'): + """Affine to the image by matrix. + + Args: + img (paddle.Tensor): Image to be rotated. + matrix (float or int): Affine matrix. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set NEAREST . when use pil backend, + support method are as following: + - "nearest" + - "bilinear" + - "bicubic" + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + data_format (str, optional): Data format of img, should be 'HWC' or + 'CHW'. Default: 'CHW'. + + Returns: + paddle.Tensor: Affined image. + + """ + ndim = len(img.shape) + if ndim == 3: + img = img.unsqueeze(0) + + img = img if data_format.lower() == 'chw' else img.transpose((0, 3, 1, 2)) + + matrix = paddle.to_tensor(matrix, place=img.place) + matrix = matrix.reshape((1, 2, 3)) + shape = img.shape + + grid = _affine_grid( + matrix, w=shape[-1], h=shape[-2], ow=shape[-1], oh=shape[-2]) + + if isinstance(fill, int): + fill = tuple([fill] * 3) + + out = _grid_transform(img, grid, mode=interpolation, fill=fill) + + out = out if data_format.lower() == 'chw' else out.transpose((0, 2, 3, 1)) + out = out.squeeze(0) if ndim == 3 else out + + return out + + def rotate(img, angle, interpolation='nearest', @@ -354,6 +400,72 @@ def rotate(img, return out.squeeze(0) +def _perspective_grid(img, coeffs, ow, oh, dtype): + theta1 = coeffs[:6].reshape([1, 2, 3]) + tmp = paddle.tile(coeffs[6:].reshape([1, 2]), repeat_times=[2, 1]) + dummy = paddle.ones((2, 1), dtype=dtype) + theta2 = paddle.concat((tmp, dummy), axis=1).unsqueeze(0) + + d = 0.5 + base_grid = paddle.ones((1, oh, ow, 3), dtype=dtype) + + x_grid = paddle.linspace(d, ow * 1.0 + d - 1.0, ow) + base_grid[..., 0] = x_grid + y_grid = paddle.linspace(d, oh * 1.0 + d - 1.0, oh).unsqueeze_(-1) + base_grid[..., 1] = y_grid + + scaled_theta1 = theta1.transpose( + (0, 2, 1)) / paddle.to_tensor([0.5 * ow, 0.5 * oh]) + output_grid1 = base_grid.reshape((1, oh * ow, 3)).bmm(scaled_theta1) + output_grid2 = base_grid.reshape( + (1, oh * ow, 3)).bmm(theta2.transpose((0, 2, 1))) + + output_grid = output_grid1 / output_grid2 - 1.0 + return output_grid.reshape((1, oh, ow, 2)) + + +def perspective(img, + coeffs, + interpolation="nearest", + fill=None, + data_format='CHW'): + """Perspective the image. 
+ + Args: + img (paddle.Tensor): Image to be rotated. + coeffs (list[float]): coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set NEAREST. When use pil backend, + support method are as following: + - "nearest" + - "bilinear" + - "bicubic" + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + + Returns: + paddle.Tensor: Perspectived image. + + """ + + ndim = len(img.shape) + if ndim == 3: + img = img.unsqueeze(0) + + img = img if data_format.lower() == 'chw' else img.transpose((0, 3, 1, 2)) + ow, oh = img.shape[-1], img.shape[-2] + dtype = img.dtype if paddle.is_floating_point(img) else paddle.float32 + + coeffs = paddle.to_tensor(coeffs, place=img.place) + grid = _perspective_grid(img, coeffs, ow=ow, oh=oh, dtype=dtype) + out = _grid_transform(img, grid, mode=interpolation, fill=fill) + + out = out if data_format.lower() == 'chw' else out.transpose((0, 2, 3, 1)) + out = out.squeeze(0) if ndim == 3 else out + + return out + + def vflip(img, data_format='CHW'): """Vertically flips the given paddle tensor. diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 828a0d9b0936d..fea2efb1fb2b1 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -45,7 +45,14 @@ def _get_image_size(img): elif F._is_numpy_image(img): return img.shape[:2][::-1] elif F._is_tensor_image(img): - return img.shape[1:][::-1] # chw + if len(img.shape) == 3: + return img.shape[1:][::-1] # chw -> wh + elif len(img.shape) == 4: + return img.shape[2:][::-1] # nchw -> wh + else: + raise ValueError( + "The dim for input Tensor should be 3-D or 4-D, but received {}". + format(len(img.shape))) else: raise TypeError("Unexpected type {}".format(type(img))) @@ -1205,6 +1212,189 @@ def _apply_image(self, img): return F.pad(img, self.padding, self.fill, self.padding_mode) +def _check_sequence_input(x, name, req_sizes): + msg = req_sizes[0] if len(req_sizes) < 2 else " or ".join( + [str(s) for s in req_sizes]) + if not isinstance(x, Sequence): + raise TypeError(f"{name} should be a sequence of length {msg}.") + if len(x) not in req_sizes: + raise ValueError(f"{name} should be sequence of length {msg}.") + + +def _setup_angle(x, name, req_sizes=(2, )): + if isinstance(x, numbers.Number): + if x < 0: + raise ValueError( + f"If {name} is a single number, it must be positive.") + x = [-x, x] + else: + _check_sequence_input(x, name, req_sizes) + + return [float(d) for d in x] + + +class RandomAffine(BaseTransform): + """Random affine transformation of the image. + + Args: + degrees (int|float|tuple): The angle interval of the random rotation. + If set as a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees) in clockwise order. If set 0, will not rotate. + translate (tuple, optional): Maximum absolute fraction for horizontal and vertical translations. + For example translate=(a, b), then horizontal shift is randomly sampled in the range -img_width * a < dx < img_width * a + and vertical shift is randomly sampled in the range -img_height * b < dy < img_height * b. + Default is None, will not translate. + scale (tuple, optional): Scaling factor interval, e.g (a, b), then scale is randomly sampled from the range a <= scale <= b. 
+ Default is None, will keep original scale and not scale. + shear (sequence or number, optional): Range of degrees to shear, ranges from -180 to 180 in clockwise order. + If set as a number, a shear parallel to the x axis in the range (-shear, +shear) will be applied. + Else if set as a sequence of 2 values a shear parallel to the x axis in the range (shear[0], shear[1]) will be applied. + Else if set as a sequence of 4 values, a x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Default is None, will not apply shear. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + center (2-tuple, optional): Optional center of rotation, (x, y). + Origin is the upper left corner. + Default is the center of the image. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + + Shape: + - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). + - output(PIL.Image|np.ndarray|Paddle.Tensor): An affined image. + + Returns: + A callable object of RandomAffine. + + Examples: + + .. code-block:: python + + import paddle + from paddle.vision.transforms import RandomAffine + + transform = RandomAffine([-90, 90], translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[-10, 10]) + + fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32) + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, + degrees, + translate=None, + scale=None, + shear=None, + interpolation='nearest', + fill=0, + center=None, + keys=None): + self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2, )) + + super(RandomAffine, self).__init__(keys) + assert interpolation in ['nearest', 'bilinear', 'bicubic'] + self.interpolation = interpolation + + if translate is not None: + _check_sequence_input(translate, "translate", req_sizes=(2, )) + for t in translate: + if not (0.0 <= t <= 1.0): + raise ValueError( + "translation values should be between 0 and 1") + self.translate = translate + + if scale is not None: + _check_sequence_input(scale, "scale", req_sizes=(2, )) + for s in scale: + if s <= 0: + raise ValueError("scale values should be positive") + self.scale = scale + + if shear is not None: + self.shear = _setup_angle(shear, name="shear", req_sizes=(2, 4)) + else: + self.shear = shear + + if fill is None: + fill = 0 + elif not isinstance(fill, (Sequence, numbers.Number)): + raise TypeError("Fill should be either a sequence or a number.") + self.fill = fill + + if center is not None: + _check_sequence_input(center, "center", req_sizes=(2, )) + self.center = center + + def _get_param(self, + img_size, + degrees, + translate=None, + scale_ranges=None, + shears=None): + """Get parameters for affine transformation + + Returns: + params to be passed to the affine transformation + """ + angle = random.uniform(degrees[0], degrees[1]) + + if translate is not None: + max_dx = float(translate[0] * img_size[0]) + max_dy = 
float(translate[1] * img_size[1]) + tx = int(random.uniform(-max_dx, max_dx)) + ty = int(random.uniform(-max_dy, max_dy)) + translations = (tx, ty) + else: + translations = (0, 0) + + if scale_ranges is not None: + scale = random.uniform(scale_ranges[0], scale_ranges[1]) + else: + scale = 1.0 + + shear_x, shear_y = 0.0, 0.0 + if shears is not None: + shear_x = random.uniform(shears[0], shears[1]) + if len(shears) == 4: + shear_y = random.uniform(shears[2], shears[3]) + shear = (shear_x, shear_y) + + return angle, translations, scale, shear + + def _apply_image(self, img): + """ + Args: + img (PIL.Image|np.array): Image to be affine transformed. + + Returns: + PIL.Image or np.array: Affine transformed image. + """ + + w, h = _get_image_size(img) + img_size = [w, h] + + ret = self._get_param(img_size, self.degrees, self.translate, + self.scale, self.shear) + + return F.affine( + img, + *ret, + interpolation=self.interpolation, + fill=self.fill, + center=self.center) + + class RandomRotation(BaseTransform): """Rotates the image by angle. @@ -1298,6 +1488,125 @@ def _apply_image(self, img): self.center, self.fill) +class RandomPerspective(BaseTransform): + """Random perspective transformation with a given probability. + + Args: + prob (float, optional): Probability of using transformation, ranges from + 0 to 1, default is 0.5. + distortion_scale (float, optional): Degree of distortion, ranges from + 0 to 1, default is 0.5. + interpolation (str, optional): Interpolation method. If omitted, or if + the image has only one channel, it is set to PIL.Image.NEAREST or + cv2.INTER_NEAREST. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + + Shape: + - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). + - output(PIL.Image|np.ndarray|Paddle.Tensor): A perspectived image. + + Returns: + A callable object of RandomPerspective. + + Examples: + + .. code-block:: python + + import paddle + from paddle.vision.transforms import RandomPerspective + + transform = RandomPerspective(prob=1.0, distortion_scale=0.9) + + fake_img = paddle.randn((3, 200, 150)).astype(paddle.float32) + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, + prob=0.5, + distortion_scale=0.5, + interpolation='nearest', + fill=0, + keys=None): + super(RandomPerspective, self).__init__(keys) + assert 0 <= prob <= 1, "probability must be between 0 and 1" + assert 0 <= distortion_scale <= 1, "distortion_scale must be between 0 and 1" + assert interpolation in ['nearest', 'bilinear', 'bicubic'] + assert isinstance(fill, (numbers.Number, str, list, tuple)) + + self.prob = prob + self.distortion_scale = distortion_scale + self.interpolation = interpolation + self.fill = fill + + def get_params(self, width, height, distortion_scale): + """ + Returns: + startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, + endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. 
+ """ + half_height = height // 2 + half_width = width // 2 + topleft = [ + int(random.uniform(0, int(distortion_scale * half_width) + 1)), + int(random.uniform(0, int(distortion_scale * half_height) + 1)), + ] + topright = [ + int( + random.uniform(width - int(distortion_scale * half_width) - 1, + width)), + int(random.uniform(0, int(distortion_scale * half_height) + 1)), + ] + botright = [ + int( + random.uniform(width - int(distortion_scale * half_width) - 1, + width)), + int( + random.uniform(height - int(distortion_scale * half_height) - 1, + height)), + ] + botleft = [ + int(random.uniform(0, int(distortion_scale * half_width) + 1)), + int( + random.uniform(height - int(distortion_scale * half_height) - 1, + height)), + ] + startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], + [0, height - 1]] + endpoints = [topleft, topright, botright, botleft] + + return startpoints, endpoints + + def _apply_image(self, img): + """ + Args: + img (PIL.Image|np.array|paddle.Tensor): Image to be Perspectively transformed. + + Returns: + PIL.Image|np.array|paddle.Tensor: Perspectively transformed image. + """ + + width, height = _get_image_size(img) + + if random.random() < self.prob: + startpoints, endpoints = self.get_params(width, height, + self.distortion_scale) + return F.perspective(img, startpoints, endpoints, + self.interpolation, self.fill) + return img + + class Grayscale(BaseTransform): """Converts image to grayscale. @@ -1377,7 +1686,9 @@ class RandomErasing(BaseTransform): fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) transform = paddle.vision.transforms.RandomErasing() - result = transform(fake_img) + result = transform(fake_img) + + print(result) """ def __init__(self, diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index b0800a9cd845e..8420590399549 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -391,6 +391,22 @@ if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then fi fi +if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + ERROR_LINES="" + for TEST_FILE in ${UNITTEST_FILE_CHANGED}; + do + ENABLE_LEGACY_DYGRAPH_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "_enable_legacy_dygraph" || true` + if [ "${ENABLE_LEGACY_DYGRAPH_CI}" != "" ]; then + ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${ENABLE_LEGACY_DYGRAPH_CI}\n" + fi + done + if [ "${ERROR_LINES}" != "" ]; then + ERROR_LINES=${ERROR_LINES//+/'\n+\t'} + echo_line="_enable_legacy_dygraph forces the mode to old dynamic graph. You must have one RD (pangyoki (Recommend), Aurelius84 or JiabinYang) approval for the usage (either add or delete) of _enable_legacy_dygraph. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Enable-Eager-Mode-in-Paddle-CI. 
diff --git a/tools/get_ut_mem_map.py b/tools/get_ut_mem_map.py
index daf80597d3ad0..745d7f9a90c24 100644
--- a/tools/get_ut_mem_map.py
+++ b/tools/get_ut_mem_map.py
@@ -34,8 +34,8 @@ def get_ut_mem(rootPath):
             if '[Memory Usage (Byte)] gpu' in line:
                 mem_reserved = round(
                     float(
-                        line.split('[max memory reserved] gpu')[1].split(
-                            ':')[1].split('\\n')[0].strip()), 2)
+                        line.split(' : Reserved = ')[1].split(
+                            ', Allocated = ')[0]), 2)
                 if mem_reserved > mem_reserved1:
                     mem_reserved1 = mem_reserved
             if 'MAX_GPU_MEMORY_USE=' in line:
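The new parsing in ``tools/get_ut_mem_map.py`` above extracts the reserved byte count by splitting on the literal markers ``' : Reserved = '`` and ``', Allocated = '``. A minimal sketch on an assumed sample log line; the exact wording of that line is an assumption for illustration:

.. code-block:: python

    # Assumed sample of the memory-usage log line that the parser targets.
    line = "[Memory Usage (Byte)] gpu 0 : Reserved = 1073741824, Allocated = 536870912"

    if '[Memory Usage (Byte)] gpu' in line:
        # Same extraction as the patched code: the number between
        # " : Reserved = " and ", Allocated = ".
        mem_reserved = round(
            float(line.split(' : Reserved = ')[1].split(', Allocated = ')[0]), 2)
        print(mem_reserved)  # 1073741824.0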
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index aaa667595f94c..6067b40f0a7c1 100755
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -170,6 +170,7 @@
     'test_elementwise_div_op',
     'test_elementwise_floordiv_op',
     'test_elementwise_gradient_op',
+    'test_elementwise_heaviside_op',
     'test_elementwise_max_op',
     'test_elementwise_min_op',
     'test_elementwise_mod_op',
@@ -654,10 +655,12 @@
     'test_transpose_mkldnn_op',
     'test_mkldnn_conv_activation_fuse_pass',
     'test_mkldnn_conv_concat_relu_mkldnn_fuse_pass',
+    'test_mkldnn_int8_scale_calculation_pass',
     'test_mkldnn_matmul_op_output_fuse_pass',
     'test_mkldnn_matmul_transpose_reshape_fuse_pass',
     'test_mkldnn_scale_matmul_fuse_pass',
     'test_mkldnn_inplace_fuse_pass',
+    'test_mkldnn_conv_affine_channel_fuse_pass',
     'test_batch_fc_op',
     'test_c_comm_init_all_op',
     'test_conv2d_fusion_op',
diff --git a/tools/test_runner.py b/tools/test_runner.py
index 7ceed18634a87..02d926914f904 100644
--- a/tools/test_runner.py
+++ b/tools/test_runner.py
@@ -32,6 +32,7 @@ def main():
     if core.is_compiled_with_cuda() or core.is_compiled_with_rocm():
         if (os.getenv('FLAGS_enable_gpu_memory_usage_log') == None):
             os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true'
+            os.environ['FLAGS_enable_gpu_memory_usage_log_mb'] = 'false'
 
     some_test_failed = False
     for module_name in sys.argv[1:]: