From 0b60f28c9e47cad858d95769c0007642052dbe08 Mon Sep 17 00:00:00 2001
From: engineer1109 <1292846099@qq.com>
Date: Mon, 3 Apr 2023 17:45:44 +0800
Subject: [PATCH] remove WITH_ASCEND_CL PADDLE_WITH_ASCEND_CL WITH_ASCEND_CXX11
 (#52448)

---
 CMakeLists.txt                                |  28 -
 cmake/configure.cmake                         |   4 -
 cmake/external/ascend.cmake                   | 108 ---
 cmake/external/gloo.cmake                     |  56 +-
 cmake/external/protobuf.cmake                 |   7 +-
 cmake/external/threadpool.cmake               |   6 +-
 cmake/external/warpctc.cmake                  | 139 +--
 cmake/flags.cmake                             |   4 -
 cmake/inference_lib.cmake                     |  16 +-
 cmake/operators.cmake                         |  30 -
 cmake/third_party.cmake                       |  10 -
 paddle/fluid/framework/details/CMakeLists.txt |  15 +-
 .../fluid/framework/details/nan_inf_utils.h   |   6 -
 .../framework/details/nan_inf_utils_detail.cc | 176 ----
 paddle/fluid/framework/device_worker.h        |   3 +-
 .../fluid/framework/device_worker_factory.cc  |   3 +-
 paddle/fluid/framework/executor.cc            |  17 -
 paddle/fluid/framework/fleet/CMakeLists.txt   |   7 -
 .../fluid/framework/fleet/ascend_wrapper.cc   |  22 -
 paddle/fluid/framework/fleet/ascend_wrapper.h | 214 ----
 paddle/fluid/framework/garbage_collector.cc   |  26 -
 paddle/fluid/framework/garbage_collector.h    |  22 -
 .../interpreter/execution_config.cc           |   5 -
 .../interpreter/interpreter_util.cc           |  10 -
 .../framework/new_executor/interpretercore.cc |  27 -
 paddle/fluid/framework/operator.cc            |  21 -
 paddle/fluid/framework/parallel_executor.cc   |  14 -
 paddle/fluid/framework/phi_utils.cc           |   9 -
 paddle/fluid/framework/pipeline_trainer.cc    |   5 +-
 paddle/fluid/framework/section_worker.cc      |  15 +-
 paddle/fluid/framework/tensor_test.cc         |  66 --
 paddle/fluid/framework/tensor_util.cc         | 158 +--
 paddle/fluid/framework/tensor_util.h          | 116 ---
 paddle/fluid/framework/tensor_util_test.cc    |  26 -
 paddle/fluid/framework/trainer.h              |   3 +-
 paddle/fluid/framework/trainer_factory.cc     |   3 +-
 paddle/fluid/framework/type_defs.h            |  22 -
 paddle/fluid/framework/var_type_traits.h      |  12 -
 .../ir_params_sync_among_devices_pass.cc      |  48 -
 .../ir_params_sync_among_devices_pass.h       |   4 -
 paddle/fluid/inference/api/analysis_config.cc |  29 -
 .../fluid/inference/api/analysis_predictor.cc |   8 -
 paddle/fluid/inference/api/api_impl.cc        |  17 -
 paddle/fluid/inference/api/api_impl_tester.cc |   9 -
 .../inference/api/details/zero_copy_tensor.cc |  38 -
 .../api/details/zero_copy_tensor_test.cc      |   4 -
 .../inference/api/paddle_analysis_config.h    |   6 -
 paddle/fluid/inference/capi_exp/pd_config.cc  |   5 -
 paddle/fluid/inference/capi_exp/pd_config.h   |   8 -
 paddle/fluid/inference/goapi/config.go        |   9 -
 paddle/fluid/memory/allocation/CMakeLists.txt |   5 -
 .../memory/allocation/allocator_facade.cc     |  35 +-
 .../memory/allocation/allocator_facade.h      |   7 -
 .../memory/allocation/buddy_allocator.cc      |   9 +-
 .../memory/allocation/buddy_allocator_test.cc |  31 +-
 .../allocation/naive_best_fit_allocator.cc    | 204 ----
 .../naive_best_fit_allocator_test.cc          |  16 -
 .../fluid/memory/allocation/npu_allocator.cc  |  80 --
 .../fluid/memory/allocation/npu_allocator.h   |  42 -
 .../memory/allocation/npu_pinned_allocator.cc |  99 --
 .../memory/allocation/npu_pinned_allocator.h  |  51 -
 .../memory/allocation/system_allocator.cc     | 129 ---
 .../memory/allocation/system_allocator.h      |  26 -
 .../allocation/system_allocator_test.cc       |   8 -
 paddle/fluid/memory/memcpy.cc                 | 424 +------
 paddle/fluid/operators/coalesce_tensor_op.cc  |   3 +-
 .../fluid/operators/copy_cross_scope_test.cc  |  12 -
 .../fluid/operators/detection/CMakeLists.txt  |  17 +-
 paddle/fluid/operators/expand_op.h            |   7 -
 paddle/fluid/operators/expand_v2_op.h         |  14 -
 paddle/fluid/operators/math/CMakeLists.txt    |  11 +-
 paddle/fluid/operators/memcpy_d2h_op.cc       |  28 -
 paddle/fluid/operators/norm_op.cc             |   4 -
 paddle/fluid/platform/device/device_wrapper.h |   3 -
 paddle/fluid/platform/device_context.cc       |  25 -
 paddle/fluid/platform/device_context.h        |  98 --
 paddle/fluid/platform/device_event.h          |   6 -
 paddle/fluid/platform/device_event_npu.cc     | 116 ---
 .../fluid/platform/dynload/dynamic_loader.cc  |   1 -
 .../fluid/platform/dynload/dynamic_loader.h   |   1 -
 paddle/fluid/platform/gen_comm_id_helper.cc   |   5 +-
 paddle/fluid/platform/gen_comm_id_helper.h    |   5 +-
 paddle/fluid/platform/init.cc                 |  11 -
 paddle/fluid/pybind/ascend_wrapper_py.cc      | 917 ------------------
 paddle/fluid/pybind/ascend_wrapper_py.h       |  32 -
 paddle/fluid/pybind/imperative.cc             |  13 -
 paddle/fluid/pybind/inference_api.cc          |   9 +-
 paddle/fluid/pybind/parallel_executor.cc      |   4 -
 paddle/fluid/pybind/place.cc                  |  49 +-
 paddle/fluid/pybind/pybind.cc                 |  49 +-
 paddle/fluid/pybind/tensor.cc                 |   4 -
 paddle/fluid/pybind/tensor_py.h               |  62 --
 paddle/phi/backends/device_memory_aligment.h  |   6 +-
 paddle/phi/backends/dynload/CMakeLists.txt    |   5 -
 paddle/phi/backends/dynload/dynamic_loader.cc |  18 -
 paddle/phi/backends/dynload/dynamic_loader.h  |   1 -
 paddle/phi/backends/npu/npu_info.h            |  36 -
 paddle/phi/core/flags.cc                      |  42 +-
 paddle/phi/core/utils/visit_place.h           |  20 -
 .../phi/kernels/funcs/interpolate_function.h  |   7 -
 test/CMakeLists.txt                           |  56 +-
 test/amp/CMakeLists.txt                       |  56 +-
 test/asp/CMakeLists.txt                       |   5 +-
 103 files changed, 140 insertions(+), 4400 deletions(-)
 delete mode 100644 paddle/fluid/framework/fleet/ascend_wrapper.cc
 delete mode 100644 paddle/fluid/framework/fleet/ascend_wrapper.h
 delete mode 100644 paddle/fluid/memory/allocation/npu_allocator.cc
 delete mode 100644 paddle/fluid/memory/allocation/npu_allocator.h
 delete mode 100644 paddle/fluid/memory/allocation/npu_pinned_allocator.cc
 delete mode 100644 paddle/fluid/memory/allocation/npu_pinned_allocator.h
 delete mode 100644 paddle/fluid/platform/device_event_npu.cc
 delete mode 100644 paddle/fluid/pybind/ascend_wrapper_py.cc
 delete mode 100644 paddle/fluid/pybind/ascend_wrapper_py.h
 delete mode 100644 paddle/phi/backends/npu/npu_info.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe10d96261ace..6ab6cbf54ac86 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,10 +58,6 @@ option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
 option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
 option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
 option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
-# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
-# to develop some acl related functionality on x86
-option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
-option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
 option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
 option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF)
 option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
@@ -113,14 +109,6 @@ if(APPLE AND WITH_ARM)
   set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
 endif()
 
-if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
-  if(WITH_ARM_BRPC)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
-  else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
-  endif()
-endif()
-
 if(WIN32)
   option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
@@ -525,15 +513,6 @@ if(WITH_DISTRIBUTE)
       ON
      CACHE STRING
"Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) endif() - if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC) - # disable WITH_PSCORE for NPU before include third_party - message( - WARNING - "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") - set(WITH_PSCORE - OFF - CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) - endif() if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496) # TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496 message( @@ -567,13 +546,6 @@ if(WITH_RPC) OFF CACHE BOOL "Disable WITH_RPC when not compiled with distribute" FORCE) endif() - if(WITH_ASCEND_CL AND WITH_RPC) - message( - WARNING "Disable WITH_RPC when compiling with NPU. Force WITH_RPC=OFF.") - set(WITH_RPC - OFF - CACHE BOOL "Disable WITH_RPC when compiling with NPU" FORCE) - endif() if(WITH_ROCM AND WITH_RPC) message( WARNING "Disable WITH_RPC when compiling with ROCM. Force WITH_RPC=OFF.") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 5147e54ea71fc..71e42632b2bd6 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -97,10 +97,6 @@ if(WITH_ASCEND) add_definitions(-DPADDLE_WITH_ASCEND) endif() -if(WITH_ASCEND_CL) - add_definitions(-DPADDLE_WITH_ASCEND_CL) -endif() - if(WITH_ASCEND_INT64) add_definitions(-DPADDLE_WITH_ASCEND_INT64) endif() diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index 3dbe7e6e8aa90..cbddf9496c24f 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -25,111 +25,3 @@ if(EXISTS # It means CANN 20.2 + add_definitions(-DPADDLE_WITH_ASCEND_STRING) endif() - -if(WITH_ASCEND OR WITH_ASCEND_CL) - set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) - set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) - set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) - set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) - set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) - set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) - set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) - - set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} - ${ASCEND_ATC_DIR}) - set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) - set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) - set(ATLAS_RUNTIME_INC_DIR - ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) - set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) - set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) - set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} - ${ATLAS_ATC_DIR}) - - set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) - set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) - set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) - include_directories(${ATLAS_RUNTIME_INC_DIR}) - - add_library(ascend_ge SHARED IMPORTED GLOBAL) - set_property(TARGET ascend_ge PROPERTY IMPORTED_LOCATION - ${atlas_ge_runner_lib}) - - add_library(ascend_graph SHARED IMPORTED GLOBAL) - set_property(TARGET ascend_graph PROPERTY IMPORTED_LOCATION - ${atlas_graph_lib}) - - add_library(atlas_acl SHARED IMPORTED GLOBAL) - set_property(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) - - add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) -endif() - -if(WITH_ASCEND_CL) - set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) - - set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so) - set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) - set(acl_op_compiler_lib 
-      ${ASCEND_CL_DIR}/libacl_op_compiler.so)
-  set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
-  set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
-
-  message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
-  message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
-  include_directories(${FWKACLLIB_INC_DIR})
-  include_directories(${ACLLIB_INC_DIR})
-
-  add_library(ascendcl SHARED IMPORTED GLOBAL)
-  set_property(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
-
-  add_library(ascend_hccl SHARED IMPORTED GLOBAL)
-  set_property(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
-
-  add_library(acl_op_compiler SHARED IMPORTED GLOBAL)
-  set_property(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION
-                                               ${acl_op_compiler_lib})
-  add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
-endif()
-
-if(WITH_ASCEND_CL)
-  macro(find_ascend_toolkit_version ascend_toolkit_version_info)
-    file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
-    string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)"
-                 ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
-    string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1"
-                 ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
-    string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION
-                 ${ASCEND_TOOLKIT_VERSION})
-    string(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION)
-    add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
-    if(NOT ASCEND_TOOLKIT_VERSION)
-      set(ASCEND_TOOLKIT_VERSION "???")
-    else()
-      message(
-        STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}")
-    endif()
-  endmacro()
-
-  macro(find_ascend_driver_version ascend_driver_version_info)
-    file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS)
-    string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION
-                 "${ASCEND_DRIVER_VERSION_CONTENTS}")
-    string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1"
-                 ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}")
-    if(NOT ASCEND_DRIVER_VERSION)
-      set(ASCEND_DRIVER_VERSION "???")
-    else()
-      message(
-        STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}")
-    endif()
-  endmacro()
-
-  if(WITH_ARM)
-    set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux)
-  else()
-    set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux)
-  endif()
-
-  find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info)
-  find_ascend_driver_version(${ASCEND_DIR}/driver/version.info)
-endif()
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
index 574d673b88784..0666d48538b74 100755
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -61,44 +61,24 @@ if(CMAKE_COMPILER_IS_GNUCC)
 endif()
 include_directories(${GLOO_INCLUDE_DIR})
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-                    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-            "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-else()
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    PATCH_COMMAND ${GLOO_PATCH_COMMAND}
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-                    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-            "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-endif()
+ExternalProject_Add(
+  ${GLOO_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${GLOO_REPOSITORY}
+  GIT_TAG ${GLOO_TAG}
+  PREFIX "${GLOO_PREFIX_DIR}"
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${GLOO_PATCH_COMMAND}
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND
+    mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake ..
+    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . && mkdir
+    -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
+  INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
+                  ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
+          "${GLOO_INCLUDE_DIR}/gloo"
+  BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
 
 add_library(gloo STATIC IMPORTED GLOBAL)
 set_property(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index af3a2c5d84460..7e81c0ab4b856 100755
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -237,9 +237,6 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
   if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
     set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
     set(PROTOBUF_TAG v21.12)
-  elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
-    set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
-    set(PROTOBUF_TAG v21.12)
   elseif(WITH_IPU)
     set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
     set(PROTOBUF_TAG v21.12)
@@ -325,9 +322,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
   endif()
 endfunction()
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(PROTOBUF_VERSION 21.12)
-elseif(WITH_IPU)
+if(WITH_IPU)
   set(PROTOBUF_VERSION 21.12)
 elseif(WITH_ARM_BRPC)
   set(PROTOBUF_VERSION 21.12-baidu-ee-common)
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
index 1047465095f42..afeacdc833906 100644
--- a/cmake/external/threadpool.cmake
+++ b/cmake/external/threadpool.cmake
@@ -15,11 +15,7 @@ include(ExternalProject)
 
 set(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
-else()
-  set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
-endif()
+set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
 set(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
 
 set(THREADPOOL_INCLUDE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7f8da7fbe506b..e1e7234da0e25 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -64,96 +64,59 @@ else()
   set(USE_OMP ON)
 endif()
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    extern_warpctc
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
-    GIT_TAG ${WARPCTC_TAG}
-    PREFIX ${WARPCTC_PREFIX_DIR}
-    #UPDATE_COMMAND ""
-    PATCH_COMMAND ""
-    BUILD_ALWAYS 1
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-               -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-               -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-               -DWITH_GPU=${WITH_GPU}
-               -DWITH_ROCM=${WITH_ROCM}
-               -DWITH_OMP=${USE_OMP}
-               -DWITH_TORCH=OFF
-               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-               -DBUILD_SHARED=ON
-               -DBUILD_TESTS=OFF
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-               ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS
-      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
+if(WIN32)
+  set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_DEBUG $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_RELEASE
+      $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_RELEASE
+      $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_DEBUG
+      $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
 else()
-  if(WIN32)
-    set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_DEBUG
-        $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_RELEASE
-        $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_RELEASE
-        $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_DEBUG
-        $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-  else()
-    set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
-    set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-    set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()
+ExternalProject_Add(
+  extern_warpctc
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${WARPCTC_REPOSITORY}
+  GIT_TAG ${WARPCTC_TAG}
+  PREFIX ${WARPCTC_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
+  #BUILD_ALWAYS 1
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
+             -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+             -DWITH_GPU=${WITH_GPU}
+             -DWITH_ROCM=${WITH_ROCM}
+             -DWITH_OMP=${USE_OMP}
+             -DWITH_TORCH=OFF
+             -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+             -DBUILD_SHARED=ON
+             -DBUILD_TESTS=OFF
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
+             ${EXTERNAL_OPTIONAL_ARGS}
+             ${WARPCTC_CCBIN_OPTION}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
+  BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
 
 message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index b880c8028a4f6..5363b1758720d 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -167,10 +167,6 @@ if(NOT WIN32)
     set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare -Wno-non-virtual-dtor)
   endif()
 
-  if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
-    set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new)
-  endif()
-
   if(NOT APPLE)
     if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
       set(COMMON_FLAGS
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 505cfd1cab4f1..f5fc9b8b9cf8f 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -508,14 +508,9 @@ function(version version_file)
     OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
   file(
     WRITE ${version_file}
-    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
-    "WITH_MKL: ${WITH_MKL}\n"
-    "WITH_MKLDNN: ${WITH_MKLDNN}\n"
-    "WITH_GPU: ${WITH_GPU}\n"
-    "WITH_ROCM: ${WITH_ROCM}\n"
-    "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
-    "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
-    "WITH_IPU: ${WITH_IPU}\n")
+    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "WITH_MKL: ${WITH_MKL}\n"
+    "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n"
+    "WITH_ROCM: ${WITH_ROCM}\n" "WITH_IPU: ${WITH_IPU}\n")
   if(WITH_GPU)
     file(APPEND ${version_file}
          "CUDA version: ${CUDA_VERSION}\n"
@@ -526,11 +521,6 @@ function(version version_file)
          "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n"
          "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
   endif()
-  if(WITH_ASCEND_CL)
-    file(APPEND ${version_file}
-         "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
-         "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
-  endif()
   if(WITH_IPU)
     file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n")
   endif()
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 42c7cc5862a9f..34b4536e4e279 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -74,9 +74,6 @@ function(op_library TARGET)
   set(MKLDNN_FILE)
   set(op_common_deps
       operator op_registry math_function layer common_infer_shape_functions)
-  if(WITH_ASCEND_CL)
-    set(op_common_deps ${op_common_deps} npu_op_runner)
-  endif()
   if(WITH_MLU)
     set(op_common_deps ${op_common_deps} mlu_baseop)
   endif()
@@ -175,12 +172,6 @@ function(op_library TARGET)
         list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
       endif()
     endif()
-    if(WITH_ASCEND_CL)
-      string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
-      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
-        list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
-      endif()
-    endif()
     if(WITH_MLU)
       string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
@@ -213,8 +204,6 @@ function(op_library TARGET)
       list(APPEND xpu_kp_cc_srcs ${src})
     elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
       list(APPEND xpu_kp_cc_srcs ${src})
-    elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
-      list(APPEND npu_cc_srcs ${src})
     elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
       list(APPEND mlu_cc_srcs ${src})
     elseif(${src} MATCHES ".*\\.cc$")
@@ -331,13 +320,6 @@ function(op_library TARGET)
       SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
       DEPS ${op_library_DEPS} ${op_common_deps})
   else()
-    # deal with CANN version control while registering NPU operators before build
-    if(WITH_ASCEND_CL)
-      if(CANN_VERSION LESS 504000)
-        list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc")
-        list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc")
-      endif()
-    endif()
     # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
     if(WITH_UNITY_BUILD AND op_library_UNITY)
       # Combine the cc source files.
@@ -541,18 +523,6 @@ function(op_library TARGET)
     endforeach()
   endif()
 
-  # pybind USE_OP_DEVICE_KERNEL for NPU
-  if(WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
-    foreach(npu_src ${npu_cc_srcs})
-      set(op_name "")
-      find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name)
-      if(NOT ${op_name} EQUAL "")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n")
-        set(pybind_flag 1)
-      endif()
-    endforeach()
-  endif()
-
   # pybind USE_OP_DEVICE_KERNEL for MLU
   if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
     foreach(mlu_src ${mlu_cc_srcs})
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 59e31b7c9aafa..42474cb801f11 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -394,16 +394,6 @@ if(WITH_BOX_PS)
   list(APPEND third_party_deps extern_box_ps)
 endif()
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  include(external/ascend)
-  if(WITH_ASCEND OR WITH_ASCEND_CL)
-    list(APPEND third_party_deps extern_ascend)
-  endif()
-  if(WITH_ASCEND_CL)
-    list(APPEND third_party_deps extern_ascend_cl)
-  endif()
-endif()
-
 if(WITH_PSCORE)
   include(external/snappy)
   list(APPEND third_party_deps extern_snappy)
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index b13cb45bf988f..820846cacca6b 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -205,17 +205,10 @@ elseif(WITH_ROCM)
     SRCS fused_broadcast_op_handle.cc
     DEPS broadcast_op_handle)
 else()
-  if(WITH_ASCEND_CL)
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS npu_op_runner framework_proto scope place)
-  else()
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS framework_proto scope place)
-  endif()
+  cc_library(
+    nan_inf_utils
+    SRCS nan_inf_utils_detail.cc
+    DEPS framework_proto scope place)
   cc_library(
     all_reduce_op_handle
     SRCS all_reduce_op_handle.cc
diff --git a/paddle/fluid/framework/details/nan_inf_utils.h b/paddle/fluid/framework/details/nan_inf_utils.h
index ef2a7d8f0f1e0..ec2c1a45d0fc0 100644
--- a/paddle/fluid/framework/details/nan_inf_utils.h
+++ b/paddle/fluid/framework/details/nan_inf_utils.h
@@ -54,12 +54,6 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
   }
 }
 
-#ifdef PADDLE_WITH_ASCEND_CL
-void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::Scope& scope,
-                                 const platform::Place& place);
-#endif
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 59a40ea1f38ab..e3e08e8b7df28 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -19,8 +19,6 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/phi/common/amp_type_traits.h"
 
-#ifdef PADDLE_WITH_ASCEND_CL
-#endif
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/extensions.h"
 
@@ -243,40 +241,6 @@ void CheckVarHasNanOrInf(const std::string& op_type,
         "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile "
         "with XPU.",
         var_name));
-#endif
-    return;
-  } else if (platform::is_npu_place(tensor->place())) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (framework::TransToProtoVarType(tensor->dtype()) !=
-        proto::VarType::FP32) {
-      return;
-    }
-
-    phi::DenseTensor cpu_tensor;
-    cpu_tensor.Resize(tensor->dims());
-    float* cpu_data = static_cast<float*>(
-        cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype()));
-
-    framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-    bool flag = false;
-    for (int i = 0; i < cpu_tensor.numel(); i++) {
-      if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
-        flag = true;
-        break;
-      }
-    }
-    PADDLE_ENFORCE_NE(
-        flag,
-        true,
-        platform::errors::Fatal(
-            "Operator %s output phi::DenseTensor %s contains Inf.",
-            op_type,
-            var_name));
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile "
-        "with NPU.",
-        var_name));
 #endif
     return;
   }
@@ -309,139 +273,6 @@ bool IsSkipOp(const framework::OperatorBase& op) {
   return false;
 }
 
-#ifdef PADDLE_WITH_ASCEND_CL
-using NpuOpRunner = paddle::operators::NpuOpRunner;
-
-constexpr int FLOAT_STATUS_SIZE = 8;
-
-static phi::DenseTensor& npu_float_status() {
-  static phi::DenseTensor float_status;
-  return float_status;
-}
-
-void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::Scope& scope,
-                                 const platform::Place& place) {
-  if (!platform::is_npu_place(place)) return;
-
-  std::call_once(white_list_init_flag, InitWhiteListFormEnv);
-  if (IsSkipOp(op)) return;
-
-  auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
-      platform::DeviceContextPool::Instance().Get(place));
-  auto stream = dev_ctx->stream();
-
-  auto& flag = npu_float_status();
-  flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
-  NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);
-
-  phi::DenseTensor tmp;
-  tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
-  NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);
-}
-
-void PrintNpuVarInfo(const std::string& op_type,
-                     const std::string& var_name,
-                     const framework::Variable* var,
-                     const platform::Place& place) {
-  const phi::DenseTensor* tensor{nullptr};
-  if (var->IsType<phi::DenseTensor>()) {
-    tensor = &var->Get<phi::DenseTensor>();
-  } else if (var->IsType<phi::SelectedRows>()) {
-    tensor = &var->Get<phi::SelectedRows>().value();
-  } else {
-    VLOG(10) << var_name << " var_name need not to check";
-    return;
-  }
-
-  if ((framework::TransToProtoVarType(tensor->dtype()) !=
-       proto::VarType::FP32) &&
-      (framework::TransToProtoVarType(tensor->dtype()) !=
-       proto::VarType::FP16)) {
-    return;
-  }
-
-  if (tensor->memory_size() == 0) {
-    VLOG(10) << var_name << " var_name need not to check, size == 0";
-    return;
-  }
-
-  VLOG(10) << "begin check " << op_type << " var_name:" << var_name
-           << ", place:" << tensor->place() << ", numel:" << tensor->numel();
-
-  phi::DenseTensor cpu_tensor;
-  cpu_tensor.Resize(tensor->dims());
-  cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype());
-  framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-
-  LOG(WARNING) << "print [" << var_name << "] tensor info:";
-  // use env strategy control in future, -1=print_all.
-  int print_num = 3;
-  if (framework::TransToProtoVarType(tensor->dtype()) ==
-      proto::VarType::FP32) {
-    const float* value = cpu_tensor.data<float>();
-    PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
-  } else if (framework::TransToProtoVarType(tensor->dtype()) ==
-             proto::VarType::FP16) {
-    const paddle::platform::float16* value =
-        cpu_tensor.data<paddle::platform::float16>();
-    PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
-  }
-}
-
-void PrintNPUOpValueInfo(const framework::OperatorBase& op,
-                         const framework::Scope& scope,
-                         const platform::Place& place) {
-  LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
-               << "), here we print some tensor value info of this op.";
-  for (auto& vname : op.InputVars()) {
-    auto* var = scope.FindVar(vname);
-    if (var == nullptr) continue;
-    PrintNpuVarInfo(op.Type(), vname, var, place);
-  }
-
-  for (auto& vname : op.OutputVars(true)) {
-    auto* var = scope.FindVar(vname);
-    if (var == nullptr) continue;
-    PrintNpuVarInfo(op.Type(), vname, var, place);
-  }
-}
-
-static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
-                                  const framework::Scope& scope,
-                                  const platform::Place& place) {
-  if (!platform::is_npu_place(place)) return;
-
-  auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
-      platform::DeviceContextPool::Instance().Get(place));
-  auto stream = dev_ctx->stream();
-
-  auto& flag = npu_float_status();
-  phi::DenseTensor tmp;
-  tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
-  // NPUGetFloatStatus updates data on input in-place.
-  // tmp is only placeholder.
-  NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);
-
-  phi::DenseTensor cpu_tensor;
-  auto cpu_place = platform::CPUPlace();
-  float* cpu_data = static_cast<float*>(
-      cpu_tensor.mutable_data<float>({FLOAT_STATUS_SIZE}, cpu_place));
-
-  framework::TensorCopySync(flag, cpu_place, &cpu_tensor);
-  float sum = 0.0;
-  for (int i = 0; i < FLOAT_STATUS_SIZE; ++i) {
-    sum += cpu_data[i];
-  }
-
-  if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);
-
-  PADDLE_ENFORCE_LT(sum,
-                    1.0,
-                    platform::errors::PreconditionNotMet(
-                        "Operator %s contains Nan/Inf.", op.Type()));
-}
-#endif
-
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                         const framework::Scope& exec_scope,
                         const platform::Place& place) {
@@ -449,13 +280,6 @@ void CheckOpHasNanOrInf(const framework::OperatorBase& op,
 
   if (IsSkipOp(op)) return;
 
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place)) {
-    NPUCheckOpHasNanOrInf(op, exec_scope, place);
-    return;
-  }
-#endif
-
   if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
     // NOTE. vname may destruct in the end of this func.
     for (auto& vname : op.OutputVars(true)) {
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 770c51e0012dd..743513e38aad1 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -674,8 +674,7 @@ class PSGPUWorker : public HogwildWorker {
 };
 #endif
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class SectionWorker : public DeviceWorker {
  public:
   SectionWorker() {}
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index 05e4fec365b69..ae01f622effa8 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -83,8 +83,7 @@ REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
 REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
 #endif
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index c35c1138df90c..c4384ea823f48 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -516,23 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
 #else
     PADDLE_THROW(
         platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
-#endif
-  } else if (platform::is_npu_place(place_)) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (IsFastEagerDeletionModeEnabled()) {
-      VLOG(4) << "Use unsafe fast gc for NPU.";
-      gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Please set FLAGS_fast_eager_deletion_mode=true to use "
-          "GarbageCollector on NPU."));
-      // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
-      VLOG(4) << "Use default stream gc for NPU.";
-      gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
-    }
-#else
-    PADDLE_THROW(
-        platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle"));
 #endif
   } else if (platform::is_mlu_place(place_)) {
 #ifdef PADDLE_WITH_MLU
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 10fb82e23049f..7ebc58e61b588 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -124,10 +124,3 @@ cc_test(
   test_fleet_cc
   SRCS test_fleet.cc
   DEPS fleet_wrapper gloo_wrapper fs shell)
-
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  cc_library(
-    ascend_wrapper
-    SRCS ascend_wrapper.cc
-    DEPS framework_proto lod_tensor ascend_ge ascend_graph)
-endif()
diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc b/paddle/fluid/framework/fleet/ascend_wrapper.cc
deleted file mode 100644
index 273939f6bee61..0000000000000
--- a/paddle/fluid/framework/fleet/ascend_wrapper.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
-namespace paddle {
-namespace framework {
-std::shared_ptr<AscendInstance> AscendInstance::ascend_instance_ = nullptr;
-}  // end namespace framework
-}  // end namespace paddle
-#endif
diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h
deleted file mode 100644
index 372f0e7d38be0..0000000000000
--- a/paddle/fluid/framework/fleet/ascend_wrapper.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include <glog/logging.h>
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "ge/ge_api.h"
-#include "graph/attr_value.h"
-#include "graph/tensor.h"
-#include "graph/types.h"
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/timer.h"
-
-namespace paddle {
-namespace framework {
-
-typedef ge::Graph AscendGraphDesc;
-
-#ifdef PADDLE_WITH_ASCEND_STRING
-using AscendString = ge::AscendString;
-#else
-using AscendString = std::string;
-#endif
-
-class AscendInstance {
- public:
-  virtual ~AscendInstance() {}
-  AscendInstance() {}
-
-  std::map<AscendString, AscendString> _GetDefaultInitOptions() {
-    std::map<AscendString, AscendString> init_options;
-    init_options["ge.exec.deviceId"] = "0";
-    init_options["ge.graphRunMode"] = "1";
-    return init_options;
-  }
-
-  std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
-    std::map<AscendString, AscendString> init_options;
-    // init_options["a"] = "b";
-    // init_options["ge.trainFlag"] = "1";
-    return init_options;
-  }
-
-  ge::Status InitGEForUT() {
-    return ge::GEInitialize(_GetDefaultInitOptions());
-  }
-
-  void InitGlobalResouces() {
-    LOG(INFO) << "Begin ascend InitGlobalResouces";
-    session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
-    if (session_ == nullptr) {
-      PADDLE_THROW(platform::errors::Fatal("new session error: nullptr"));
-    }
-    LOG(INFO) << "End ascend InitGlobalResouces";
-  }
-
-  void DestroyGlobalResouces() {
-    LOG(INFO) << "Begin ascend DestroyGlobalResouces";
-    session_ = nullptr;
-    LOG(INFO) << "Begin ascend DestroyGlobalResouces";
-  }
-
-  static std::shared_ptr<AscendInstance> GetInstance() {
-    if (nullptr == ascend_instance_) {
-      ascend_instance_.reset(new paddle::framework::AscendInstance());
-      VLOG(1) << "Initialize AscendInstance Done";
-    }
-    return ascend_instance_;
-  }
-
-  void AddAscendSubgraph(int graph_idx, const AscendGraphDesc &graph) {
-    ge::Status status = session_->AddGraph(graph_idx, graph);
-    PADDLE_ENFORCE_EQ(status,
-                      ge::SUCCESS,
-                      paddle::platform::errors::PreconditionNotMet(
-                          "Calling addGraph of graph engine failed, please "
-                          "check Ascend Log."));
-    VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done";
-  }
-
-  ge::DataType VarTypeToGeType(proto::VarType::Type type) {
-    if (type == proto::VarType::FP16) {
-      return ge::DataType::DT_FLOAT16;
-    } else if (type == proto::VarType::FP32) {
-      return ge::DataType::DT_FLOAT;
-    } else if (type == proto::VarType::FP64) {
-      return ge::DataType::DT_DOUBLE;
-    } else if (type == proto::VarType::INT32) {
-      return ge::DataType::DT_INT32;
-    } else if (type == proto::VarType::INT64) {
-      return ge::DataType::DT_INT64;
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Not support %s as tensor type.", DataTypeToString(type)));
-    }
-  }
-  int GeTypeSize(proto::VarType::Type type) {
-    if (type == proto::VarType::FP16) {
-      return 2;
-    } else if (type == proto::VarType::FP32) {
-      return 4;
-    } else if (type == proto::VarType::FP64) {
-      return 8;
-    } else if (type == proto::VarType::INT32) {
-      return 4;
-    } else if (type == proto::VarType::INT64) {
-      return 8;
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Not support %s as tensor type.", DataTypeToString(type)));
-    }
-  }
-  ge::Tensor ConvertToGeTensor(const phi::DenseTensor *tensor) {
-    auto numel = tensor->numel();
-    std::vector<int64_t> vec_dim;
-    auto dimen = arity(tensor->dims());
-    for (auto i = 0; i < dimen; ++i) {
-      vec_dim.push_back(tensor->dims()[i]);
-    }
-    // For Debug
-    // VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() <<
-    // ", and shape is";
-    // for (const auto e : vec_dim) {
-    //   VLOG(0) << e;
-    // }
-
-    ge::Shape shape(vec_dim);
-    ge::TensorDesc tensor_desc(
-        shape,
-        ge::Format::FORMAT_ND,
-        VarTypeToGeType(framework::TransToProtoVarType(tensor->dtype())));
-    tensor_desc.SetRealDimCnt(vec_dim.size());
-
-    const uint8_t *data = reinterpret_cast<const uint8_t *>(tensor->data());
-    std::vector<uint8_t> dst(
-        numel * GeTypeSize(framework::TransToProtoVarType(tensor->dtype())));
-    memcpy(dst.data(),
-           data,
-           GeTypeSize(framework::TransToProtoVarType(tensor->dtype())) * numel);
-    ge::Tensor ge_tensor(tensor_desc, dst);
-    return ge_tensor;
-  }
-
-  void RunAscendSubgraph(int graph_idx,
-                         const std::vector<const phi::DenseTensor *> &inputs,
-                         std::vector<phi::DenseTensor *> *outputs) {
-    VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run.";
-    // Convert paddle phi::DenseTensor to GE phi::DenseTensor
-    std::vector<ge::Tensor> ge_inputs;
-    for (const auto &e : inputs) {
-      ge_inputs.push_back(ConvertToGeTensor(e));
-    }
-
-    // Run Graph
-    std::vector<ge::Tensor> ge_outputs;
-    ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs);
-    PADDLE_ENFORCE_EQ(status,
-                      ge::SUCCESS,
-                      paddle::platform::errors::PreconditionNotMet(
-                          "Calling RunGraph of graph engine failed, please "
-                          "check Ascend Log."));
-    VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done";
-
-    // change tensor back, note all tensor's type computed in GE is uint8
-    for (size_t i = 0; i < ge_outputs.size(); ++i) {
-      const uint8_t *ret_data = ge_outputs[i].GetData();
-      size_t size = ge_outputs[i].GetSize();
-      VLOG(1) << "GE phi::DenseTensor size of the " << i << "th output var is "
-              << size;
-      auto *dst = (*outputs)[i]->mutable_data<uint8_t>({(int64_t)size},
-                                                       platform::CPUPlace());
-      memcpy(dst, ret_data, size);
-
-      // Following for debug:
-      // VLOG(0) << "output for " << i << " var: ";
-      // float *tmp = reinterpret_cast<float *>(dst);
-      // for (size_t j = 0; j < size / 4; ++j) {
-      //   printf("%f ", tmp[j]);
-      // }
// printf("\n"); - } - } - - protected: - std::shared_ptr session_; - - private: - static std::shared_ptr ascend_instance_; -}; -} // namespace framework -} // namespace paddle -#endif diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 77a666a24d9ea..7c4b3d5c440bd 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -125,32 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback( } #endif -#ifdef PADDLE_WITH_ASCEND_CL -NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector( - const platform::NPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} - -void NPUDefaultStreamGarbageCollector::Wait() const { - static_cast(this->dev_ctx_) - ->WaitStreamCallback(); -} - -void NPUDefaultStreamGarbageCollector::ClearCallback( - const std::function &callback) { - static_cast(this->dev_ctx_) - ->AddStreamCallback(callback); -} -NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector( - const platform::NPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} - -void NPUUnsafeFastGarbageCollector::ClearCallback( - const std::function &callback) { - callback(); -} - -#endif - #ifdef PADDLE_WITH_MLU MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector( const platform::MLUPlace &place, size_t max_memory_size) diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index b75994536037a..14d38363dbe06 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -139,28 +139,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { }; #endif -#ifdef PADDLE_WITH_ASCEND_CL -class NPUDefaultStreamGarbageCollector : public GarbageCollector { - public: - NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place, - size_t max_memory_size); - - void Wait() const override; - - protected: - void ClearCallback(const std::function &callback) override; -}; - -class NPUUnsafeFastGarbageCollector : public GarbageCollector { - public: - NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place, - size_t max_memory_size); - - protected: - void ClearCallback(const std::function &callback) override; -}; -#endif - #ifdef PADDLE_WITH_MLU class MLUDefaultStreamGarbageCollector : public GarbageCollector { public: diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index 9de402450d5df..1e6a6f02e2230 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -60,11 +60,6 @@ inline std::tuple GetThreadPoolConfig(const phi::Place& place, if (platform::is_xpu_place(place)) { #if defined(PADDLE_WITH_XPU) device_count = phi::backends::xpu::GetXPUDeviceCount(); -#endif - } - if (platform::is_npu_place(place)) { -#if defined(PADDLE_WITH_ASCEND_CL) - device_count = platform::GetNPUDeviceCount(); #endif } if (platform::is_ipu_place(place)) { diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 8ba9e7a70e590..29626988132f9 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -631,16 +631,6 @@ void BuildOpFuncList(const 
     VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
 
-#ifdef PADDLE_WITH_ASCEND_CL
-    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
-    // values, but only through special `float_status` to checks whether
-    // the operation is overflow. More about `float_status`, see:
-    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
-    if (FLAGS_check_nan_inf) {
-      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
-    }
-#endif
-
     try {
       if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
         VLOG(4) << "HandleOperatorBase";
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 8d38da543ad03..bee8e8ca7b795 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -87,16 +87,6 @@ inline void SetDeviceId(const platform::Place& place) {
 #else
     auto dev_id = place.device;
     platform::SetXPUDeviceId(dev_id);
-#endif
-  } else if (platform::is_npu_place(place)) {
-#ifndef PADDLE_WITH_ASCEND_CL
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Cannot run operator on place %s, please recompile paddle or "
-        "reinstall Paddle with NPU support.",
-        place));
-#else
-    auto dev_id = place.device;
-    platform::SetNPUDeviceId(dev_id);
 #endif
   } else if (platform::is_custom_place(place)) {
 #ifndef PADDLE_WITH_CUSTOM_DEVICE
@@ -218,11 +208,6 @@ void InterpreterCore::RunImpl() {
     async_work_queue_ = GetWorkQueue();
     ExecuteInstructionList(vec_instruction_);
   }
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place_)) {
-    platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   if (platform::is_custom_place(place_)) {
     platform::DeviceContextPool::Instance().Get(place_)->Wait();
@@ -893,18 +878,6 @@ void InterpreterCore::RunOperator(const Instruction& instr_node) {
                           : var_scope_.GetMutableScope();
   VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
 
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place)) {
-    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the
-    // variable values, but only through special `float_status` to checks
-    // whether the operation is overflow. More about `float_status`, see:
-    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
-    if (FLAGS_check_nan_inf) {
-      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
-    }
-  }
-#endif
-
   auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
   {
     // If it is OperatorBase, InferShape do nothing.
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 971b377c2afe9..6a46a03b9adce 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -770,16 +770,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #else
       auto dev_id = place.device;
       platform::SetXPUDeviceId(dev_id);
-#endif
-    } else if (platform::is_npu_place(place)) {
-#ifndef PADDLE_WITH_ASCEND_CL
-      PADDLE_THROW(platform::errors::Unavailable(
-          "Cannot run operator on place %s, please recompile paddle or "
-          "reinstall Paddle with NPU support.",
-          place));
-#else
-      auto dev_id = place.device;
-      platform::SetNPUDeviceId(dev_id);
 #endif
     } else if (platform::is_mlu_place(place)) {
 #ifndef PADDLE_WITH_MLU
@@ -1692,17 +1682,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   bool fallback_to_cpu = false;
   auto* dev_ctx = pool.Get(place);
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
-  // values, but only through special `float_status` to checks whether
-  // the operation is overflow. More about `float_status`, see:
-  // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
-  if (FLAGS_check_nan_inf) {
-    framework::details::NPUAllocAndClearFloatStatus(*this, scope, place);
-  }
-#endif
-
   // using cache
   if (kernel_type_.get()) {
     dev_ctx = pool.Get(kernel_type_->place_);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 144651f1b63cc..1c703a25bea38 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -553,20 +553,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
       PADDLE_THROW(platform::errors::PermissionDenied(
          "Paddle can't use IPU device since it's not compiled with IPU,"
          "Please recompile or reinstall Paddle with IPU support."));
-#endif
-    } else if (platform::is_npu_place(place)) {
-#if defined(PADDLE_WITH_ASCEND_CL)
-      if (IsFastEagerDeletionModeEnabled()) {
-        gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
-      } else {
-        gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
-      }
-      VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
-#else
-      PADDLE_THROW(platform::errors::PermissionDenied(
-          "Paddle can't use NPU device since it's not compiled with "
-          "NPU,"
-          "Please recompile or reinstall Paddle with NPU support."));
 #endif
     } else if (platform::is_custom_place(place)) {
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc
index f2fa4c24ae2ae..b4b25726964f3 100644
--- a/paddle/fluid/framework/phi_utils.cc
+++ b/paddle/fluid/framework/phi_utils.cc
@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
         phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (kernel_key.backend() == phi::Backend::NPU) {
-    VLOG(3) << "phi missing NPU kernel: " << op.Type()
-            << ", expected_kernel_key:" << kernel_key
-            << ", fallback to CPU one!";
-    return phi::KernelKey(
-        phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
-  }
-#endif
 #ifdef PADDLE_WITH_MLU
   if (kernel_key.backend() == phi::Backend::MLU) {
     VLOG(3) << "phi missing MLU kernel: " << op.Type()
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index 7bcb6ed6f14b5..bf3a0ea31cf25 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
@@ -37,8 +36,6 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   int place_id = section_config.place_id();
 #if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL)
   place_ = platform::CUDAPlace(place_id);
-#elif (defined PADDLE_WITH_ASCEND_CL)  // NOLINT
-  place_ = platform::NPUPlace(place_id);
 #endif
   worker_ = DeviceWorkerFactory::CreateDeviceWorker(
       trainer_desc.device_worker_name());
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index ed04b1622b04f..58e879a5011c2 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -9,8 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include <float.h>
 
 #include "paddle/fluid/framework/device_worker.h"
@@ -235,18 +234,6 @@ void SectionWorker::TrainFiles() {
         gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
       }
     }
-#elif defined(PADDLE_WITH_ASCEND_CL)
-    if (IsFastEagerDeletionModeEnabled()) {
-      VLOG(4) << "Use unsafe fast gc for NPU.";
-      gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Please set FLAGS_fast_eager_deletion_mode=true to use "
-          "GarbageCollector on NPU."));
-      // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
- VLOG(4) << "Use default stream gc for NPU."; - gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size)); - } #endif } // max_memory_size >= 0 diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 42690c071bc4c..5ef6f53d38d50 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -143,35 +143,6 @@ TEST(DenseTensor, MutableData) { EXPECT_EQ(p1, p2); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - { - phi::DenseTensor src_tensor; - float* p1 = nullptr; - float* p2 = nullptr; - // initialization - p1 = src_tensor.mutable_data(phi::make_ddim({1, 2, 3}), - platform::NPUPlace(0)); - auto p1_holder = src_tensor.Holder(); - EXPECT_NE(p1, nullptr); - // set src_tensor a new dim with large size - // memory is supposed to be re-allocated - p2 = src_tensor.mutable_data(phi::make_ddim({3, 1024}), - platform::NPUPlace(0)); - auto p2_holder = src_tensor.Holder(); - EXPECT_NE(p2, nullptr); - EXPECT_NE(p1_holder.get(), p2_holder.get()); - // set src_tensor a new dim with same size - // memory block is supposed to be unchanged - p1 = src_tensor.mutable_data(phi::make_ddim({2, 2, 3}), - platform::NPUPlace(0)); - EXPECT_EQ(p1, p2); - // set src_tensor a new dim with smaller size - // memory block is supposed to be unchanged - p2 = src_tensor.mutable_data(phi::make_ddim({2, 2}), - platform::NPUPlace(0)); - EXPECT_EQ(p1, p2); - } -#endif } TEST(DenseTensor, ShareDataWith) { @@ -207,16 +178,6 @@ TEST(DenseTensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - { - phi::DenseTensor src_tensor; - phi::DenseTensor dst_tensor; - src_tensor.mutable_data(phi::make_ddim({2, 3, 4}), - platform::NPUPlace(0)); - dst_tensor.ShareDataWith(src_tensor); - ASSERT_EQ(src_tensor.data(), dst_tensor.data()); - } -#endif } TEST(DenseTensor, Slice) { @@ -271,33 +232,6 @@ TEST(DenseTensor, Slice) { EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); } #endif - -#ifdef PADDLE_WITH_ASCEND_CL - { - phi::DenseTensor src_tensor; - src_tensor.mutable_data(phi::make_ddim({6, 9}), - platform::NPUPlace(0)); - phi::DenseTensor slice_tensor = src_tensor.Slice(2, 6); - phi::DDim slice_dims = slice_tensor.dims(); - ASSERT_EQ(arity(slice_dims), 2); - EXPECT_EQ(slice_dims[0], 4); - EXPECT_EQ(slice_dims[1], 9); - - uintptr_t src_data_address = - reinterpret_cast(src_tensor.data()); - uintptr_t src_mutable_data_address = - reinterpret_cast(src_tensor.mutable_data( - src_tensor.dims(), platform::NPUPlace(0))); - uintptr_t slice_data_address = - reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = - reinterpret_cast(slice_tensor.mutable_data( - slice_tensor.dims(), platform::NPUPlace(0))); - EXPECT_EQ(src_data_address, src_mutable_data_address); - EXPECT_EQ(slice_data_address, slice_mutable_data_address); - EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); - } -#endif } TEST(DenseTensor, ReshapeToMatrix) { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 91b87a98447ce..4c69bdd0ff502 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -125,112 +125,6 @@ void TensorCopyImpl(const TENSOR& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - // TODO(zhiqiu): handle different condition like CUDA code below - else if (platform::is_npu_place(src_place) && // NOLINT -
platform::is_cpu_place(dst_place)) { - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); - } - else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { - // 1. cpu tensor -> npu pinned tensor - platform::NPUPinnedPlace npu_pinned_place; - phi::DenseTensor npu_pinned_tensor; - npu_pinned_tensor.Resize(src.dims()); - auto npu_pinned_ptr = - npu_pinned_tensor.mutable_data(npu_pinned_place, src.dtype()); - memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); - - // 2. async copy npu pinned tensor -> npu tensor - memory::Copy( - dst_place, - dst_ptr, - npu_pinned_place, - npu_pinned_ptr, - size, - reinterpret_cast(ctx).stream()); - - // 3. record event - auto npu_pinned_allocator = - static_cast( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(npu_pinned_place) - .get()); - phi::Allocation* allocation = npu_pinned_tensor.Holder().get(); - npu_pinned_allocator->RecordEvent( - allocation, - reinterpret_cast(ctx).stream()); - } - else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { - if (src_ptr == dst_ptr) { - VLOG(3) << "Skip copy the same data async from " << src_place << " to " - << dst_place; - return; - } - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); - } - else if (platform::is_npu_pinned_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { /* npu_pinned->npu */ - auto src_npu_pinned_place = src_place; - auto dst_npu_place = dst_place; - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ( - platform::is_npu_place(ctx_place), - true, - platform::errors::PreconditionNotMet( - "Device context place mismatch. When copying phi::DenseTensor " - "data from NPU Pinned memory to NPU memory, current " - "device context place should be NPU.")); - auto ctx_npu_place = ctx_place; - PADDLE_ENFORCE_EQ(dst_npu_place, - ctx_npu_place, - platform::errors::PreconditionNotMet( - "The target NPU device and current device context do " - "not match. The target NPU device number is %d, but " - "device context NPU number is %d.", - dst_npu_place.device, - ctx_npu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy( - dst_npu_place, dst_ptr, src_npu_pinned_place, src_ptr, size, stream); - } - else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_npu_pinned_place(dst_place)) { /* npu->npu_pinned */ - auto src_npu_place = src_place; - auto dst_npu_pinned_place = dst_place; - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ( - platform::is_npu_place(ctx_place), - true, - platform::errors::PreconditionNotMet( - "Device context place mismatch. When copying phi::DenseTensor " - "data from NPU memory to NPU Pinned memory, current " - "device context place should be NPU.")); - auto ctx_npu_place = ctx_place; - PADDLE_ENFORCE_EQ(src_place, - ctx_npu_place, - platform::errors::PreconditionNotMet( - "The source NPU device and current device context do " - "not match. 
The source NPU device number is %d, but " - "device context NPU number is %d.", - src_npu_place.device, - ctx_npu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy( - dst_npu_pinned_place, dst_ptr, src_npu_place, src_ptr, size, stream); - } - else { // NOLINT - PADDLE_THROW(platform::errors::Unimplemented( - "Copy from %s to %s is not supported.", src_place, dst_place)); - } -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -539,29 +433,6 @@ void TensorCopySync(const phi::DenseTensor& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } - else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { /* cpu -> npu*/ - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } - else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { /* npu -> npu*/ - if (src_ptr == dst_ptr) { - VLOG(3) << "Skip copy the same data sync from " << src_place << " to " - << dst_place; - return; - } - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } - else { // NOLINT - PADDLE_THROW(platform::errors::Unimplemented( - "Copy from %s to %s is not supported.", src_place, dst_place)); - } -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -758,31 +629,6 @@ void TensorToStream(std::ostream& os, #else PADDLE_THROW(platform::errors::Unimplemented( "MLUPlace is not supported when not compiled with MLU")); -#endif - } else if (platform::is_npu_place(tensor.place())) { -#ifdef PADDLE_WITH_ASCEND_CL - constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB - std::unique_ptr buf(new char[kBufSize]); - auto& npu_dev_ctx = - static_cast(dev_ctx); - platform::CPUPlace cpu; - uintptr_t data = reinterpret_cast(data_ptr); - while (size != 0) { - size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, - buf.get(), - tensor.place(), - reinterpret_cast(data), - size_to_write, - npu_dev_ctx.stream()); - npu_dev_ctx.Wait(); - os.write(buf.get(), size_to_write); - data += size_to_write; - size -= size_to_write; - } -#else - PADDLE_THROW(platform::errors::Unimplemented( - "NPUPlace is not supported when not compiled with NPU")); #endif } else if (platform::is_custom_place(tensor.place())) { #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -875,7 +721,7 @@ void TensorFromStream(std::istream& is, platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(shape)); framework::VisitDataType( @@ -958,7 +804,7 @@ void TensorFromStream(std::istream& is, platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_CUSTOM_DEVICE) 
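// For reference, the TensorToStream branch deleted above streams NPU memory
// to an ostream through a fixed 64 MB host staging buffer rather than
// materializing the whole tensor on the host. A minimal sketch of that
// chunked-copy pattern follows; `CopyDeviceToHost` is a hypothetical
// stand-in for the synchronous device-to-host copy (the removed code used
// memory::Copy on the NPU stream followed by npu_dev_ctx.Wait()).

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <memory>
#include <ostream>

// Hypothetical stand-in: a blocking device-to-host copy. Implemented as a
// plain memcpy here so the sketch compiles; real code copies across devices.
static void CopyDeviceToHost(void* dst, const void* src, size_t n) {
  std::memcpy(dst, src, n);
}

void WriteDeviceBufferToStream(std::ostream& os,
                               const void* device_ptr,
                               size_t size) {
  constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64 MB, as in the removed code
  std::unique_ptr<char[]> buf(new char[kBufSize]);
  auto data = reinterpret_cast<uintptr_t>(device_ptr);
  while (size != 0) {
    size_t size_to_write = std::min(kBufSize, size);
    // Stage one chunk on the host, then flush it to the stream.
    CopyDeviceToHost(buf.get(), reinterpret_cast<const void*>(data),
                     size_to_write);
    os.write(buf.get(), size_to_write);
    data += size_to_write;
    size -= size_to_write;
  }
}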
phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(dims)); framework::VisitDataType( diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 35a612678cb3e..196487bda96aa 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -25,9 +25,6 @@ limitations under the License. */ #include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_MLU #include "paddle/fluid/platform/device/mlu/device_context.h" @@ -145,37 +142,6 @@ void TensorFromArray(const T* src, reinterpret_cast(ctx).stream()); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(dst_place)) { // NOLINT - // 1. vector -> npu pinned tensor - platform::NPUPinnedPlace npu_pinned_place; - phi::DenseTensor npu_pinned_tensor; - npu_pinned_tensor.Resize(dst->dims()); - auto npu_pinned_ptr = - npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype()); - memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); - - // 2. async copy npu pinned tensor -> npu tensor - memory::Copy( - dst_place, - dst_ptr, - npu_pinned_place, - npu_pinned_ptr, - size, - reinterpret_cast(ctx).stream()); - - // 3. record event - auto npu_pinned_allocator = - static_cast( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(npu_pinned_place) - .get()); - phi::Allocation* allocation = npu_pinned_tensor.Holder().get(); - npu_pinned_allocator->RecordEvent( - allocation, - reinterpret_cast(ctx).stream()); - } -#endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); @@ -227,42 +193,6 @@ void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - // NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from - // cudaMemcpyAsync. - // cudaMemcpyAsync is actually "sync" between cpu <-> gpu. - // aclrtMemcpyAsync is really "async" between cpu <-> npu. - // Since vector is on cpu, I think this function should be a "sync" operation, - // so pass nullptr as stream to memory::Copy(). - else if (platform::is_npu_place(dst_place)) { // NOLINT - // 1. vector -> npu pinned tensor - phi::DenseTensor npu_pinned_tensor(dst->dtype()); - platform::NPUPinnedPlace npu_pinned_place; - auto npu_pinned_ptr = - npu_pinned_tensor.mutable_data(dst->dims(), npu_pinned_place); - memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); - - // 2. async copy npu pinned tensor -> npu tensor - memory::Copy( - dst_place, - dst_ptr, - npu_pinned_place, - npu_pinned_ptr, - size, - reinterpret_cast(ctx).stream()); - - // 3.
record event - auto npu_pinned_allocator = - static_cast( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(npu_pinned_place) - .get()); - phi::Allocation* allocation = npu_pinned_tensor.Holder().get(); - npu_pinned_allocator->RecordEvent( - allocation, - reinterpret_cast(ctx).stream()); - } -#endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); @@ -324,37 +254,6 @@ inline void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(dst_place)) { // NOLINT - // 1. vector -> npu pinned tensor - platform::NPUPinnedPlace npu_pinned_place; - phi::DenseTensor npu_pinned_tensor; - npu_pinned_tensor.Resize(dst->dims()); - auto npu_pinned_ptr = - npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype()); - memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); - - // 2. async copy npu pinned tensor -> npu tensor - memory::Copy( - dst_place, - dst_ptr, - npu_pinned_place, - npu_pinned_ptr, - size, - reinterpret_cast(ctx).stream()); - - // 3. record event - auto npu_pinned_allocator = - static_cast( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(npu_pinned_place) - .get()); - phi::Allocation* allocation = npu_pinned_tensor.Holder().get(); - npu_pinned_allocator->RecordEvent( - allocation, - reinterpret_cast(ctx).stream()); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(dst_place)) { // NOLINT auto stream = @@ -433,11 +332,6 @@ void TensorToVector(const phi::DenseTensor& src, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); - } -#endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT memory::Copy( @@ -491,11 +385,6 @@ inline void TensorToVector(const phi::DenseTensor& src, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); - } -#endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); @@ -566,11 +455,6 @@ inline T GetValue(const phi::DenseTensor* x) { if (!platform::is_cpu_place(x->place())) { phi::DenseTensor cpu_x; framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); -#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - const platform::DeviceContext* dev_ctx = pool.Get(x->place()); - dev_ctx->Wait(); -#endif value = cpu_x.data()[0]; } else { value = x->data()[0]; diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 9097c43023bd2..bda2681f57f31 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -299,32 +299,6 @@ TEST(TensorToVector, Tensor_bool) { } } #endif -#ifdef PADDLE_WITH_ASCEND_CL - { - std::vector src_vec = { - false, - true, - false, - true, - false, - true, - false, - true, - false, - }; - phi::DenseTensor npu_tensor; - paddle::platform::NPUPlace place(0); - paddle::platform::NPUDeviceContext 
npu_ctx(place); - paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); - - std::vector dst; - paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); - - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_vec[i], dst[i]); - } - } -#endif } TEST(TensorFromDLPack, Tensor) { diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 08696e4112db9..455487541abb9 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -302,8 +302,7 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 48ea9143d621a..16aa069a0c33a 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -82,8 +82,7 @@ REGISTER_TRAINER_CLASS(HeterXpuTrainer); (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 1b43bd25eeef0..961b7c1e663c0 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -65,28 +65,6 @@ using Attribute = paddle::variant>; using AttributeMap = std::unordered_map; -#ifdef PADDLE_WITH_ASCEND_CL -using NPUAttribute = paddle::variant, - std::vector, - std::vector, - bool, - std::vector, - BlockDesc*, - int64_t, - std::vector, - std::vector, - std::vector, - VarDesc*, - std::vector, - std::vector>>; - -using NPUAttributeMap = std::unordered_map; -#endif - using OpCreator = std::function -#include -#endif - #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -69,10 +64,6 @@ class Communicator; class NCCLCommunicator; #endif #endif -#ifdef PADDLE_WITH_ASCEND_CL -class Communicator; -class HCCLCommunicator; -#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -205,9 +196,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif -#if defined(PADDLE_WITH_ASCEND_CL) - HcclRootInfo, -#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 503ea531f171a..3e0d3348bc790 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -36,49 +36,6 @@ namespace paddle { namespace inference { namespace analysis { -#ifdef PADDLE_WITH_ASCEND_CL -void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { - if (!argument->use_npu()) return; - - auto &graph = argument->main_graph(); - std::vector repetitive_params; - - if (graph.Has(framework::ir::kRepetitiveParamAttr)) - repetitive_params = graph.Get>( - framework::ir::kRepetitiveParamAttr); - - LOG(INFO) << "Sync params from CPU to NPU"; - - PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(), - true, - platform::errors::PreconditionNotMet( - "The npu_device_id 
field should be valid")); - platform::Place place = platform::NPUPlace(argument->npu_device_id()); - auto *scope = argument->scope_ptr(); - std::vector all_vars = scope->LocalVarNames(); - - for (auto &var_name : all_vars) { - auto *var = scope->FindLocalVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - var, - platform::errors::PreconditionNotMet("The var should not be nullptr")); - - if (var->IsType()) { - auto *t = var->GetMutable(); - - platform::CPUPlace cpu_place; - phi::DenseTensor temp_tensor; - temp_tensor.Resize(t->dims()); - temp_tensor.mutable_data(cpu_place); - - paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - t->clear(); - paddle::framework::TensorCopySync(temp_tensor, place, t); - } - } -} -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. @@ -253,11 +210,6 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { argument->scope_valid(), true, platform::errors::PreconditionNotMet("The scope field should be valid")); -#ifdef PADDLE_WITH_ASCEND_CL - if (argument->use_npu_valid()) { - CopyParamsToNpu(argument); - } -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (argument->use_gpu_valid()) { CopyParamsToGpu(argument); diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index 3ffecc72a50f5..9db17abc24d2a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -35,10 +35,6 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { std::string repr() const override; private: -#ifdef PADDLE_WITH_ASCEND_CL - void CopyParamsToNpu(Argument *argument); -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void CopyParamsToGpu(Argument *argument); #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 7ae0d0f636588..b0f53c1f639ac 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -195,21 +195,6 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) { Update(); } -void AnalysisConfig::EnableNpu(int device_id) { -#if defined(PADDLE_WITH_ASCEND_CL) - use_npu_ = true; - npu_device_id_ = device_id; -#elif defined(PADDLE_WITH_CUSTOM_DEVICE) - use_custom_device_ = true; - custom_device_id_ = device_id; - custom_device_type_ = "npu"; -#else - LOG(ERROR) << "Please compile with npu to EnableNpu()"; - use_npu_ = false; -#endif - Update(); -} - void AnalysisConfig::EnableCustomDevice(const std::string &device_type, int device_id, Precision precision_mode) { @@ -1023,20 +1008,6 @@ void AnalysisConfig::Update() { "with XPU-runtime.")); #endif } - - if (use_npu_) { -#if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU) - PADDLE_ENFORCE_EQ(use_gpu_, - false, - platform::errors::Unavailable( - "Currently, NPU and GPU cannot be enabled in the " - "same analysis configuration.")); -#else - PADDLE_THROW(platform::errors::Unavailable( - "You tried to use an NPU device, but Paddle was not compiled " - "with NPU-runtime.")); -#endif - } if (use_ipu_) { #ifndef PADDLE_WITH_IPU PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc 
b/paddle/fluid/inference/api/analysis_predictor.cc index 91dee8a9ae4ee..ce47e9ff5e48e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -376,14 +376,6 @@ void AnalysisPredictor::InitPlace() { "with WITH_XPU.")); #endif // PADDLE_WITH_XPU } - } else if (config_.use_npu()) { -#ifdef PADDLE_WITH_ASCEND_CL - place_ = paddle::platform::NPUPlace(config_.npu_device_id()); -#else - PADDLE_THROW(platform::errors::Unavailable( - "You tried to use NPU forward propagation, but Paddle was not compiled " - "with WITH_ASCEND_CL.")); -#endif } else if (config_.NNAdapter().use_nnadapter) { if (config_.lite_engine_enabled()) { place_ = paddle::platform::CPUPlace(); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 3a77c1b878aba..0d5c8f98020a8 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -278,23 +278,6 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, #else PADDLE_THROW(platform::errors::Unavailable( "Not compile with XPU, should not reach here.")); -#endif - } else { -#ifdef PADDLE_WITH_ASCEND_CL - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto *dev_ctx = - static_cast(pool.Get(place_)); - auto dst_npu_place = place_; - memory::Copy(dst_npu_place, - static_cast(input_ptr), - platform::CPUPlace(), - inputs[i].data.data(), - inputs[i].data.length(), - dev_ctx->stream()); -#else - PADDLE_THROW(platform::errors::Unavailable( - "Not compile with NPU, should not reach here.")); #endif } diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 67dc193feed09..1416dacb833d9 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -305,15 +305,6 @@ TEST(inference_api_native, image_classification_xpu) { } #endif -#ifdef PADDLE_WITH_ASCEND_CL -TEST(inference_api_native, word2vec_npu) { - MainWord2Vec(paddle::PaddlePlace::kNPU); -} -// TEST(inference_api_native, image_classification_npu) { -// MainImageClassification(paddle::PaddlePlace::kNPU); -// } -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(paddle::PaddlePlace::kGPU); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 0a0a27bb6a6a0..52204ff3658f4 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -244,25 +244,6 @@ void Tensor::CopyFromCpu(const T *data) { PADDLE_THROW(paddle::platform::errors::Unavailable( "Can not create tensor with XPU place because paddle is not compiled " "with XPU.")); -#endif - } else if (place_ == PlaceType::kNPU) { -#ifdef PADDLE_WITH_ASCEND_CL - paddle::platform::DeviceContextPool &pool = - paddle::platform::DeviceContextPool::Instance(); - paddle::platform::NPUPlace npu_place(device_); - auto *t_data = tensor->mutable_data(npu_place); - auto *dev_ctx = static_cast( - pool.Get(npu_place)); - paddle::memory::Copy(npu_place, - static_cast(t_data), - paddle::platform::CPUPlace(), - data, - ele_size, - dev_ctx->stream()); -#else - PADDLE_THROW(paddle::platform::errors::Unavailable( - "Can not create tensor with NPU place because paddle is not compiled " - "with NPU.")); #endif } else { #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -468,25 +449,6 @@ void Tensor::CopyToCpuImpl(T *data, 
PADDLE_THROW(paddle::platform::errors::Unavailable( "Can not create tensor with XPU place because paddle is not compiled " "with XPU.")); -#endif - } else if (place_ == PlaceType::kNPU) { -#ifdef PADDLE_WITH_ASCEND_CL - paddle::platform::DeviceContextPool &pool = - paddle::platform::DeviceContextPool::Instance(); - auto npu_place = t_place; - auto *dev_ctx = static_cast( - pool.Get(npu_place)); - paddle::memory::Copy(paddle::platform::CPUPlace(), - static_cast(data), - npu_place, - t_data, - ele_num * sizeof(T), - dev_ctx->stream()); - paddle::platform::NPUStreamSync(dev_ctx->stream()); -#else - PADDLE_THROW(paddle::platform::errors::Unavailable( - "Can not create tensor with NPU place because paddle is not compiled " - "with NPU.")); #endif } else { #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index 1a6f1a2669b89..c3589f4251791 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -146,10 +146,6 @@ TEST(Tensor, FillRandomDataAndCheck) { ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kGPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kGPU)); #endif -#ifdef PADDLE_WITH_ASCEND_CL - ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kNPU)); - ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kNPU)); -#endif #ifdef PADDLE_WITH_XPU ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kXPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kXPU)); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 63401f2fec6cf..585f12e4d07d7 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -363,12 +363,6 @@ struct PD_INFER_DECL AnalysisConfig { /// void SetXpuDeviceId(int device_id = 0); /// - /// \brief Turn on NPU. - /// - /// \param device_id device_id the NPU card to use (default is 0). - /// - void EnableNpu(int device_id = 0); - /// /// \brief Turn on CustomDevice. /// /// \param device_type device_type the custom device to use. diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index dd0979274f75d..9b19874e0b907 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -171,11 +171,6 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, enable_multi_stream); } -void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) { - CHECK_AND_CONVERT_PD_CONFIG; - config->EnableNpu(device_id); -} - PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { CHECK_AND_CONVERT_PD_CONFIG; return config->use_xpu(); diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index 19e1a1c139d4c..a2e050f9f7306 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -214,14 +214,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( PD_Bool adaptive_seqlen, PD_Bool enable_multi_stream); /// -/// \brief Turn on NPU. -/// -/// \param[in] pd_onfig config -/// \param[in] device_id device_id the NPU card to use. -/// -PADDLE_CAPI_EXPORT extern void PD_ConfigEnableNpu( - __pd_keep PD_Config* pd_config, int32_t device_id); -/// /// \brief A boolean state telling whether the XPU is turned on. 
/// /// \param[in] pd_onfig config diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index 72c5ab078c83d..c24b941e33e3c 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -212,15 +212,6 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream)) } -/// -/// \brief Turn on NPU. -/// -/// \param deviceId the NPU card to use. -/// -func (config *Config) EnableNpu(deviceId int32) { - C.PD_ConfigEnableNpu(config.c, C.int32_t(deviceId)) -} - /// /// \brief A boolean state telling whether the GPU is turned on. /// diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index f7c57fa2b02d6..4dc408241f476 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -50,11 +50,6 @@ if(UNIX AND NOT APPLE) list(APPEND ALLOCATOR_DEPS rt) endif() -if(WITH_ASCEND_CL) - list(APPEND ALLOCATOR_SRCS npu_allocator.cc npu_pinned_allocator.cc) - list(APPEND ALLOCATOR_DEPS npu_info) -endif() - if(WITH_CUSTOM_DEVICE) list(APPEND ALLOCATOR_SRCS custom_allocator.cc) endif() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 029288f153923..42b331298ffa0 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -54,10 +54,6 @@ #include "paddle/fluid/platform/device/xpu/xpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif - #ifdef PADDLE_WITH_IPU #include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif @@ -198,12 +194,6 @@ class AllocatorFacadePrivate { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { - InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); - } - InitNaiveBestFitNPUPinnedAllocator(); -#endif #ifdef PADDLE_WITH_MLU for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); @@ -254,12 +244,6 @@ class AllocatorFacadePrivate { InitNaiveBestFitCUDAPinnedAllocator(); #endif -#ifdef PADDLE_WITH_ASCEND_CL - for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { - InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); - } - InitNaiveBestFitNPUPinnedAllocator(); -#endif #ifdef PADDLE_WITH_XPU for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); @@ -823,17 +807,6 @@ class AllocatorFacadePrivate { } #endif -#ifdef PADDLE_WITH_ASCEND_CL - void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { - allocators_[p] = std::make_shared(p); - } - - void InitNaiveBestFitNPUPinnedAllocator() { - allocators_[platform::NPUPinnedPlace()] = - std::make_shared(); - } -#endif - #ifdef PADDLE_WITH_CUSTOM_DEVICE void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) { allocators_[p] = std::make_shared(p); @@ -915,12 +888,6 @@ class AllocatorFacadePrivate { places.emplace_back(platform::XPUPlace(dev_id)); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - int device_count = platform::GetNPUDeviceCount(); - for (int dev_id = 0; dev_id < device_count; ++dev_id) { - places.emplace_back(platform::NPUPlace(dev_id)); - } 
-#endif #ifdef PADDLE_WITH_IPU int device_count = platform::GetIPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { @@ -1107,7 +1074,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } else { return m->GetAllocator(p, size)->Allocate(size); } -#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) +#elif defined(PADDLE_WITH_XPU) return GetAllocator(place)->Allocate(size); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 3ed758219783c..7f10b2286b4e7 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -16,9 +16,6 @@ #include #include "paddle/fluid/memory/allocation/allocator.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -29,10 +26,6 @@ namespace paddle { namespace memory { namespace allocation { -#ifdef PADDLE_WITH_ASCEND_CL -using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; -#endif - // Allocator Facade is the interface exposed to other modules. // All the configuration or dirty code under development should // be hidden behind this facade. diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 907fd37e44205..9a43da132086c 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -19,8 +19,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "glog/logging.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #define USE_DEVICE DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif @@ -57,9 +56,6 @@ BuddyAllocator::BuddyAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) init_allocate_size_func_ = &platform::GpuInitAllocSize; re_allocate_size_func_ = &platform::GpuReallocSize; -#elif defined(PADDLE_WITH_ASCEND_CL) - init_allocate_size_func_ = &platform::NPUInitAllocSize; - re_allocate_size_func_ = &platform::NPUReallocSize; #elif defined(PADDLE_WITH_MLU) init_allocate_size_func_ = &platform::MLUInitAllocSize; re_allocate_size_func_ = &platform::MLUReallocSize; @@ -257,9 +253,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allocate_bytes = DeviceAllocateSize( &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); -#elif defined(PADDLE_WITH_ASCEND_CL) - allocate_bytes = DeviceAllocateSize( - &platform::NPUInitAllocSize, &platform::NPUReallocSize, request_bytes); #elif defined(PADDLE_WITH_MLU) allocate_bytes = DeviceAllocateSize( &platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); diff --git a/paddle/fluid/memory/allocation/buddy_allocator_test.cc b/paddle/fluid/memory/allocation/buddy_allocator_test.cc index 315d6649c5d77..e69e773a15f67 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator_test.cc @@ -29,8 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -396,34 +395,6 @@ TEST(BuddyAllocator, Release) { } #endif -#ifdef PADDLE_WITH_ASCEND_CL -TEST(BuddyAllocator, NpuFraction) { - // In a 16 GB machine, the pool size will be about 160 MB - FLAGS_fraction_of_gpu_memory_to_use = 0.92; - FLAGS_initial_gpu_memory_in_mb = 0; - FLAGS_reallocate_gpu_memory_in_mb = 0; - - BuddyAllocator buddy_allocator( - std::unique_ptr(new NPUAllocator(0)), - platform::NPUMinChunkSize(), - platform::NPUMaxChunkSize()); - - // Less than pool size - TestBuddyAllocator(&buddy_allocator, 10); - TestBuddyAllocator(&buddy_allocator, 10 << 10); - TestBuddyAllocator(&buddy_allocator, 10 << 20); - buddy_allocator.Release(); - - // Greater than max chunk size - TestBuddyAllocator(&buddy_allocator, - 300 << 20, - /* use_system_allocator = */ true); - TestBuddyAllocator(&buddy_allocator, - 1 * static_cast(1 << 30), - /* use_system_allocator = */ true); -} -#endif - #ifdef PADDLE_WITH_MLU TEST(BuddyAllocator, MluFraction) { // In a 16 GB machine, the pool size will be about 160 MB diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 4bcfdb1aaf424..a6c8d2f3bdb03 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -213,210 +213,6 @@ size_t Used(const platform::XPUPlace &place) { #endif } -// For Ascend NPU -#ifdef PADDLE_WITH_ASCEND_CL -constexpr int EXTRA_PADDING_SIZE = 32; -class NPUBuddyAllocatorList { - private: - NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) { - auto npu_num = devices_.size(); - allocators_.resize(npu_num); - init_flags_.reserve(npu_num); - for (size_t i = 0; i < npu_num; ++i) { - init_flags_.emplace_back(new std::once_flag()); - } - } - - static NPUBuddyAllocatorList *CreateNewInstance() { - return new NPUBuddyAllocatorList(); - } - - public: - static NPUBuddyAllocatorList *Instance() { - static auto *instance = CreateNewInstance(); - return instance; - } - - BuddyAllocator *Get(int npu_id) { - auto pos = std::distance( - devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); - PADDLE_ENFORCE_LT(pos, - devices_.size(), - platform::errors::OutOfRange( - "The index exceeds the size of devices, the size of " - "devices is %d, the index is %d", - devices_.size(), - pos)); - - std::call_once(*init_flags_[pos], [this, pos] { - platform::SetNPUDeviceId(devices_[pos]); - allocators_[pos].reset( - new BuddyAllocator(std::unique_ptr( - new detail::NPUAllocator(devices_[pos])), - platform::NPUMinChunkSize(), - platform::NPUMaxChunkSize(), - EXTRA_PADDING_SIZE)); - VLOG(10) << "\n\nNOTE:\n" - << "You can set GFlags environment variable " - << "'FLAGS_fraction_of_gpu_memory_to_use' " - << "or 'FLAGS_initial_gpu_memory_in_mb' " - << "or 'FLAGS_reallocate_gpu_memory_in_mb' " - << "to change the memory size for GPU usage.\n" - << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " - << FLAGS_fraction_of_gpu_memory_to_use - << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " - << FLAGS_initial_gpu_memory_in_mb - << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " - << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; - }); - - return allocators_[pos].get(); - } - - private: - std::vector devices_; - std::vector> init_flags_; - std::vector> allocators_; -}; - -BuddyAllocator *GetNPUBuddyAllocator(int npu_id) { - return NPUBuddyAllocatorList::Instance()->Get(npu_id); -} - -BuddyAllocator *GetNPUPinnedBuddyAllocator() { - static std::once_flag init_flag; - static BuddyAllocator *ba = nullptr; - - std::call_once(init_flag, []() { - ba = new BuddyAllocator(std::unique_ptr( - new detail::NPUPinnedAllocator), - phi::backends::cpu::NPUPinnedMinChunkSize(), - phi::backends::cpu::NPUPinnedMaxChunkSize()); - }); - - return ba; -} - -#endif - -template <> -size_t Used(const platform::NPUPlace &place) { -#ifdef PADDLE_WITH_ASCEND_CL - return GetNPUBuddyAllocator(place.device)->Used(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPlace' is not supported in CPU only device.")); -#endif -} - -template <> -void *Alloc(const platform::NPUPlace &place, size_t size) { -#ifdef PADDLE_WITH_ASCEND_CL - auto *buddy_allocator = GetNPUBuddyAllocator(place.device); - auto *ptr = buddy_allocator->Alloc(size); - if (ptr == nullptr) { - platform::NPUDeviceGuard(place.device); - size_t avail, total; - platform::NPUMemoryUsage(&avail, &total); - PADDLE_THROW(platform::errors::ResourceExhausted( - "Cannot allocate %s in NPU %d, available %s, total %s, NpuMinChunkSize " - "%s, NpuMaxChunkSize %s, NPU memory used: %s.", - string::HumanReadableSize(size), - place.device, - string::HumanReadableSize(avail), - string::HumanReadableSize(total), - string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), - string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), - string::HumanReadableSize(Used(place)))); - } else { - if (FLAGS_init_allocated_mem) { - platform::NPUMemsetSync(ptr, 0xEF, size, size); - } - } - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); - return ptr; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPlace' is not supported in CPU only device.")); -#endif -} - -template <> -void Free(const platform::NPUPlace &place, - void *p, - size_t size) { -#ifdef PADDLE_WITH_ASCEND_CL - VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - GetNPUBuddyAllocator(place.device)->Free(p); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPlace' is not supported in CPU only device.")); -#endif -} - -template <> -uint64_t Release(const platform::NPUPlace &place) { -#ifdef PADDLE_WITH_ASCEND_CL - return GetNPUBuddyAllocator(place.device)->Release(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPlace' is not supported in CPU only device.")); -#endif -} - -template <> -size_t Used(const platform::NPUPinnedPlace &place) { -#ifdef PADDLE_WITH_ASCEND_CL - return GetNPUPinnedBuddyAllocator()->Used(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPinnedPlace' is not supported in CPU only device.")); -#endif -} - -template <> -void *Alloc(const platform::NPUPinnedPlace &place, - size_t size) { -#ifdef PADDLE_WITH_ASCEND_CL - auto *buddy_allocator = GetNPUPinnedBuddyAllocator(); - void *ptr = buddy_allocator->Alloc(size); - - if (ptr == nullptr) { - LOG(WARNING) << "Cannot allocate " << size << " bytes in NPUPinnedPlace"; - } - if (FLAGS_init_allocated_mem) { - memset(ptr, 0xEF, size); - } - return ptr; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPinnedPlace' is not supported in CPU
only device.")); -#endif -} - -template <> -void Free(const platform::NPUPinnedPlace &place, - void *p, - size_t size) { -#ifdef PADDLE_WITH_ASCEND_CL - GetNPUPinnedBuddyAllocator()->Free(p); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPinnedPlace' is not supported in CPU only device.")); -#endif -} - -template <> -uint64_t Release( - const platform::NPUPinnedPlace &place) { -#ifdef PADDLE_WITH_ASCEND_CL - return GetNPUPinnedBuddyAllocator()->Release(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPinnedPlace' is not supported in CPU only device.")); -#endif -} - // For CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUBuddyAllocatorList { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 7d5cb5200a6a4..6f4f901d986fd 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -61,22 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { } #endif -#ifdef PADDLE_WITH_ASCEND_CL -TEST(NaiveBestFitAllocatorTest, NpuAlloc) { - NaiveBestFitAllocator alloc{platform::NPUPlace(0)}; - { - size_t size = (1 << 20); - auto allocation = alloc.Allocate(size); - } - sleep(10); - alloc.Release(platform::NPUPlace(0)); - - size_t size = (1 << 20); - auto allocation = alloc.Allocate(size); - alloc.Release(platform::NPUPlace(0)); -} -#endif - #ifdef PADDLE_WITH_MLU TEST(NaiveBestFitAllocatorTest, MluAlloc) { NaiveBestFitAllocator alloc{platform::MLUPlace(0)}; diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc deleted file mode 100644 index a4f253ba657e9..0000000000000 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/memory/allocation/npu_allocator.h" - -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace memory { -namespace allocation { - -bool NPUAllocator::IsAllocThreadSafe() const { return true; } -void NPUAllocator::FreeImpl(phi::Allocation* allocation) { - PADDLE_ENFORCE_EQ( - allocation->place(), - place_, - platform::errors::PermissionDenied( - "NPU memory is freed in incorrect device. 
This may be a bug")); - platform::RecordedNPUFree( - allocation->ptr(), allocation->size(), place_.device); - delete allocation; -} - -phi::Allocation* NPUAllocator::AllocateImpl(size_t size) { - std::call_once(once_flag_, - [this] { platform::SetNPUDeviceId(place_.device); }); - - void* ptr; - auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device); - if (LIKELY(result == ACL_ERROR_NONE)) { - return new Allocation(ptr, size, platform::Place(place_)); - } - - size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedNPUMemGetInfo( - &avail, &total, &actual_avail, &actual_total, place_.device); - - std::string err_msg; - if (is_limited) { - auto limit_size = (total >> 20); - err_msg = string::Sprintf( - "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger " - "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum " - "GPU memory usage is limited to %d MB.\n" - " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, - limit_size); - } - - PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( - "\n\nOut of memory error on NPU %d. " - "Cannot allocate %s memory on NPU %d, " - "available memory is only %s.\n\n" - "Please check whether there is any other process using NPU %d.\n" - "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" - "2. If no, please decrease the batch size of your model. %s\n\n", - place_.device, - string::HumanReadableSize(size), - place_.device, - string::HumanReadableSize(avail), - place_.device, - err_msg)); -} - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h deleted file mode 100644 index 04832c6fd9b63..0000000000000 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include // NOLINT - -#include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace memory { -namespace allocation { - -class NPUAllocator : public Allocator { - public: - explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {} - - bool IsAllocThreadSafe() const override; - - protected: - void FreeImpl(phi::Allocation* allocation) override; - phi::Allocation* AllocateImpl(size_t size) override; - - private: - platform::NPUPlace place_; - std::once_flag once_flag_; -}; - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc deleted file mode 100644 index db76cbaace4c3..0000000000000 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-void NPUPinnedAllocator::ProcessEventsAndFree() {
-  for (auto it = npu_events_.begin(); it != npu_events_.end();) {
-    aclrtEvent event = it->second;
-    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
-    platform::NPUEventQuery(event, &status);
-
-    if (status == ACL_EVENT_STATUS_COMPLETE) {
-      auto *allocation = it->first;
-      void *ptr = allocation->ptr();
-      free(ptr);
-      npu_events_.erase(it++);
-      delete allocation;
-      platform::NPUEventDestroy(event);
-    } else {
-      ++it;
-    }
-  }
-}
-
-phi::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ProcessEventsAndFree();
-  void *ptr;
-  int error = posix_memalign(&ptr, kAlignment, size);
-  PADDLE_ENFORCE_EQ(
-      error,
-      0,
-      platform::errors::ResourceExhausted(
-          "Failed to allocate %ld bytes, error code is %d.", size, error));
-  return new Allocation(ptr, size, platform::NPUPinnedPlace());
-}
-
-void NPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  void *ptr = allocation->ptr();
-  auto iter = npu_events_.find(allocation);
-
-  // Managed by GC if RecordEvent was never called on this allocation.
-  if (iter == npu_events_.end()) {
-    // Double free? No such problem has been found so far.
-    // Otherwise we may need a set that records which
-    // Allocations are managed by GC.
-    free(ptr);
-    delete allocation;
-    return;
-  }
-
-  aclrtEvent event = iter->second;
-  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
-  platform::NPUEventQuery(event, &status);
-  if (status == ACL_EVENT_STATUS_COMPLETE) {
-    free(ptr);
-    npu_events_.erase(allocation);
-    delete allocation;
-    platform::NPUEventDestroy(event);
-  }
-  return;
-}
-
-uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  // Empty implementation
-  return static_cast<uint64_t>(0);
-}
-
-void NPUPinnedAllocator::RecordEvent(phi::Allocation *allocation,
-                                     aclrtStream stream) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  aclrtEvent event = nullptr;
-  platform::NPUEventCreate(&event);
-  platform::NPUEventRecord(event, stream);
-  npu_events_.insert({allocation, event});
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
-#endif
diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h
deleted file mode 100644
index 80d545e507ec3..0000000000000
--- a/paddle/fluid/memory/allocation/npu_pinned_allocator.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-
-#include "acl/acl.h"
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class NPUPinnedAllocator : public Allocator {
- public:
-  bool IsAllocThreadSafe() const override { return true; }
-  void ProcessEventsAndFree();
-  void RecordEvent(phi::Allocation *allocation, aclrtStream stream);
-  constexpr static size_t kAlignment = 4096UL;
-
- protected:
-  phi::Allocation *AllocateImpl(size_t size) override;
-  void FreeImpl(phi::Allocation *allocation) override;
-  uint64_t ReleaseImpl(const platform::Place &place) override;
-
- private:
-  std::unordered_map<phi::Allocation *, aclrtEvent> npu_events_;
-  mutable std::mutex mtx_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc
index 6818c3c0f5593..ddd916817e756 100644
--- a/paddle/fluid/memory/allocation/system_allocator.cc
+++ b/paddle/fluid/memory/allocation/system_allocator.cc
@@ -287,135 +287,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-void* NPUAllocator::Alloc(size_t* index, size_t size) {
-  if (size <= 0) return nullptr;
-
-  void* p;
-  auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
-
-  if (result == ACL_ERROR_NONE) {
-    *index = 0;
-    npu_alloc_size_ += size;
-    return p;
-  } else {
-    size_t avail, total, actual_avail, actual_total;
-    bool is_limited = platform::RecordedNPUMemGetInfo(
-        &avail, &total, &actual_avail, &actual_total, npu_id_);
-
-    std::string err_msg;
-    if (is_limited) {
-      auto limit_size = (total >> 20);
-      err_msg = string::Sprintf(
-          "\n   3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
-          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
-          "maximum GPU memory usage is limited to %d MB.\n"
-          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-          limit_size,
-          limit_size);
-    }
-
-    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
-        "\n\nOut of memory error on NPU %d. "
-        "Cannot allocate %s memory on NPU %d, "
-        "available memory is only %s.\n\n"
-        "Please check whether there is any other process using NPU %d.\n"
-        "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
-        "2. If no, please try one of the following suggestions:\n"
-        "   1) Decrease the batch size of your model.\n"
-        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
-        "please set it to a higher value but less than 1.0.\n"
-        "      The command is "
-        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-        npu_id_,
-        string::HumanReadableSize(size),
-        npu_id_,
-        string::HumanReadableSize(avail),
-        npu_id_,
-        FLAGS_fraction_of_gpu_memory_to_use,
-        err_msg));
-  }
-}
-
-void NPUAllocator::Free(void* p, size_t size, size_t index) {
-  VLOG(4) << "Free " << p << " size " << size;
-  PADDLE_ENFORCE_EQ(index,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(npu_alloc_size_,
-                    size,
-                    platform::errors::InvalidArgument(
-                        "The size of memory (%d) to free exceeds the size of "
-                        "allocated npu memory (%d)",
-                        size,
-                        npu_alloc_size_));
-  npu_alloc_size_ -= size;
-
-  platform::RecordedNPUFree(p, size, npu_id_);
-}
-
-bool NPUAllocator::UseGpu() const { return true; }
-
-void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
-  if (size <= 0) return nullptr;
-
-  size_t usable =
-      phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
-
-  if (size > usable) {
-    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
-                 << " MB pinned memory."
-                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
-    return nullptr;
-  }
-
-  void* p;
-  // PINNED memory is visible to all NPU contexts.
-  auto result = platform::NPUHostMalloc(&p, size);
-
-  if (result == ACL_ERROR_NONE) {
-    *index = 1;  // PINNED memory
-    npu_pinnd_alloc_size_ += size;
-    return p;
-  } else {
-    LOG(WARNING) << "NPUHostMalloc failed.";
-    return nullptr;
-  }
-
-  return nullptr;
-}
-
-void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
-  aclError err;
-  PADDLE_ENFORCE_EQ(index,
-                    1,
-                    platform::errors::InvalidArgument(
-                        "The index should be 1, but got %d", index));
-
-  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
-                    size,
-                    platform::errors::InvalidArgument(
-                        "The size of memory (%d) to free exceeds the size of "
-                        "allocated npu pinned memory (%d)",
-                        size,
-                        npu_pinnd_alloc_size_));
-  npu_pinnd_alloc_size_ -= size;
-  err = platform::NPUHostFree(p);
-
-  if (err != ACL_ERROR_NONE) {
-    PADDLE_ENFORCE_EQ(
-        err,
-        0,
-        platform::errors::Fatal(
-            "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
-  }
-}
-
-bool NPUPinnedAllocator::UseGpu() const { return false; }
-
-#endif
-
 #ifdef PADDLE_WITH_MLU
 void* MLUAllocator::Alloc(size_t* index, size_t size) {
   if (size <= 0) return nullptr;
diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h
index 18c2e278f99c5..bb1a4ee998174 100644
--- a/paddle/fluid/memory/allocation/system_allocator.h
+++ b/paddle/fluid/memory/allocation/system_allocator.h
@@ -68,32 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
 };
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-
-class NPUAllocator : public SystemAllocator {
- public:
-  explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
-
-  virtual void* Alloc(size_t* index, size_t size);
-  virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-
- private:
-  size_t npu_alloc_size_ = 0;
-  int npu_id_;
-};
-
-class NPUPinnedAllocator : public SystemAllocator {
- public:
-  virtual void* Alloc(size_t* index, size_t size);
-  virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-
- private:
-  size_t npu_pinnd_alloc_size_ = 0;
-};
-#endif
-
 #ifdef PADDLE_WITH_MLU
 class MLUAllocator : public SystemAllocator {
  public:
diff --git a/paddle/fluid/memory/allocation/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc
index 4749ff3f8adb7..d20e3a1d6c9d0 100644
--- a/paddle/fluid/memory/allocation/system_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/system_allocator_test.cc
@@ -83,14 +83,6 @@ TEST(GPUAllocator, AllocFailure) {
 }
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-TEST(NPUAllocator, Alloc) {
-  paddle::memory::detail::NPUAllocator a(0);
-  TestAllocator(&a, 1 << 20);
-  TestAllocator(&a, 1);
-}
-#endif
-
 #ifdef PADDLE_WITH_MLU
 TEST(MLUAllocator, Alloc) {
   paddle::memory::detail::MLUAllocator a(0);
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index f3fb1fdf5ab55..0c5a0fef7172a 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -260,415 +260,6 @@ void Copy(phi::Place dst_place,
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-template <>
-void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::CPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetNPUDeviceId(dst_place.device);
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by stream(" << stream << ")";
-
-  if (stream) {
-    platform::RecordEvent record_event(
-        "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpyAsync(dst,
-                             src,
-                             num,
-                             ACL_MEMCPY_HOST_TO_DEVICE,
-                             reinterpret_cast<aclrtStream>(stream));
-  } else {
-    // On NPU, an async operation after a sync operation is OK, while a sync
-    // operation after an async one is not, since the async operation may not
-    // have finished yet. So a wait is needed before any sync operation.
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
-
-    platform::RecordEvent record_event(
-        "NpuMemcpySync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
-  }
-}
-
-template <>
-void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::NPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetNPUDeviceId(src_place.device);
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by stream(" << stream << ")";
-
-  if (stream) {
-    platform::RecordEvent record_event(
-        "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpyAsync(dst,
-                             src,
-                             num,
-                             ACL_MEMCPY_DEVICE_TO_HOST,
-                             reinterpret_cast<aclrtStream>(stream));
-  } else {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
-
-    platform::RecordEvent record_event(
-        "NpuMemcpySync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
-  }
-}
-
-template <>
-void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::NPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by stream(" << stream << ")";
-  if (dst_place == src_place) {
-    platform::SetNPUDeviceId(src_place.device);
-    if (stream) {
-      platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU",
-                                         platform::TracerEventType::UserDefined,
-                                         1);
-      
platform::NPUMemcpyAsync(dst, - src, - num, - ACL_MEMCPY_DEVICE_TO_DEVICE, - reinterpret_cast(stream)); - } else { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - static_cast(pool.Get(dst_place))->Wait(); - - platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); - } - } else { - if (!platform::NPUCanAccessPeer(dst_place.device, dst_place.device)) { - PADDLE_THROW(platform::errors::Unavailable( - "Peer access between NPU places is not allowed.")); - } - if (stream) { - // TODO(zhiqiu): support peer access? - platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpyAsync(dst, - src, - num, - ACL_MEMCPY_DEVICE_TO_DEVICE, - reinterpret_cast(stream)); - } else { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - static_cast(pool.Get(dst_place))->Wait(); - - platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); - } - } -} - -template <> -void Copy( - platform::CPUPlace dst_place, - void* dst, - platform::NPUPinnedPlace src_place, - const void* src, - size_t num) { - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place; - if (UNLIKELY(num == 0)) return; - std::memcpy(dst, src, num); -} - -template <> -void Copy( - platform::NPUPinnedPlace dst_place, - void* dst, - platform::CPUPlace src_place, - const void* src, - size_t num) { - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place; - if (UNLIKELY(num == 0)) return; - std::memcpy(dst, src, num); -} - -template <> -void Copy( - platform::NPUPinnedPlace dst_place, - void* dst, - platform::NPUPinnedPlace src_place, - const void* src, - size_t num) { - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place; - if (UNLIKELY(num == 0)) return; - std::memcpy(dst, src, num); -} - -template <> -void Copy( - platform::NPUPinnedPlace dst_place, - void* dst, - platform::NPUPlace src_place, - const void* src, - size_t num, - void* stream) { - if (UNLIKELY(num == 0)) return; - - platform::SetNPUDeviceId(src_place.device); - - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; - - if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpyAsync(dst, - src, - num, - ACL_MEMCPY_DEVICE_TO_HOST, - reinterpret_cast(stream)); - } else { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - static_cast(pool.Get(src_place))->Wait(); - - platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); - } -} - -template <> -void Copy( - platform::NPUPlace dst_place, - void* dst, - platform::NPUPinnedPlace src_place, - const void* src, - size_t num, - void* stream) { - if (UNLIKELY(num == 0)) return; - - platform::SetNPUDeviceId(dst_place.device); - - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; - - if (stream) { - platform::RecordEvent 
record_event("NpuMemcpyAsync:NPUPinned->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpyAsync(dst, - src, - num, - ACL_MEMCPY_HOST_TO_DEVICE, - reinterpret_cast(stream)); - } else { - // On NPU, async operation after sync operation is ok, while sync operation - // after async is not ok, since the async operation may not done. - // So, its needed to do wait before sync operation. - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - static_cast(pool.Get(dst_place))->Wait(); - - platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); - } -} - -// NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace. -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - aclrtStream stream) { - if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::CPU) { - platform::CPUPlace place_dst, place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::NPU) { - platform::NPUPlace place_dst(dst_place.GetDeviceId()); - platform::CPUPlace place_src; - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::NPU && - dst_place.GetType() == phi::AllocationType::CPU) { - platform::NPUPlace place_src(src_place.GetDeviceId()); - platform::CPUPlace place_dst; - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::NPU && - dst_place.GetType() == phi::AllocationType::NPU) { - platform::NPUPlace place_src(src_place.GetDeviceId()); - platform::NPUPlace place_dst(dst_place.GetDeviceId()); - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - platform::CPUPlace place_src; - platform::NPUPinnedPlace place_dst; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::CPU) { - platform::CPUPlace place_dst; - platform::NPUPinnedPlace place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - platform::NPUPinnedPlace place_dst; - platform::NPUPinnedPlace place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::NPU) { - platform::NPUPinnedPlace place_src; - platform::NPUPlace place_dst(dst_place.GetDeviceId()); - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::NPU && - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - platform::NPUPinnedPlace place_dst; - platform::NPUPlace place_src(src_place.GetDeviceId()); - return Copy(place_dst, dst, place_src, src, num, stream); -#ifdef PADDLE_WITH_CUSTOM_DEVICE - } else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT - dst_place.GetType() == phi::AllocationType::CUSTOM) { - platform::CPUPlace place_src; - platform::CustomPlace place_dst(dst_place); - return Copy(place_dst, dst, place_src, src, num, 
stream); - } else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT - dst_place.GetType() == phi::AllocationType::CPU) { - platform::CustomPlace place_src(src_place); - platform::CPUPlace place_dst; - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT - dst_place.GetType() == phi::AllocationType::CUSTOM) { - platform::CustomPlace place_src(src_place); - platform::CustomPlace place_dst(dst_place); - return Copy(place_dst, dst, place_src, src, num, stream); -#endif - } -} - -// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace). -template <> -void Copy(phi::CPUPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); -} - -// NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace). -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::CPUPlace src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); -} - -// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace) -template <> -void Copy(phi::NPUPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), - dst, - src_place, - src, - num, - stream); -} - -// NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::NPUPlace src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(dst_place, - dst, - phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, - num, - stream); -} - -// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace) -template <> -void Copy(phi::NPUPinnedPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); -} - -// NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::NPUPinnedPlace src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); -} - -// NOTE: only for (CPUPlace) -> (NPUPinnedPlace) -template <> -void Copy(phi::NPUPinnedPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num) { - Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); -} - -// NOTE: only for (NPUPinnedPlace) -> (CPUPlace) -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::NPUPinnedPlace src_place, - const void* src, - size_t num) { - Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); -} -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K @@ -1391,18 +982,6 @@ void Copy(phi::Place dst_place, std::memcpy(dst, src, num); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - std::memcpy(dst, src, num); - } else if (src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::CPU) { - std::memcpy(dst, src, num); - } else if 
(src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - std::memcpy(dst, src, num); - } -#endif #ifdef PADDLE_WITH_XPU else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::CPU) { @@ -1488,8 +1067,7 @@ void Copy(phi::CPUPlace dst_place, } #if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \ - !defined(PADDLE_WITH_MLU) + !defined(PADDLE_WITH_HIP) template <> void Copy(phi::Place dst_place, diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 6884ded10cd84..3899468297cf6 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -21,8 +21,7 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/phi/backends/device_memory_aligment.h" #include "paddle/phi/kernels/funcs/math_function.h" -#ifdef PADDLE_WITH_ASCEND_CL -#endif + #include "paddle/fluid/framework/convert_utils.h" #ifdef PADDLE_WITH_MLU #include "paddle/fluid/operators/mlu/mlu_baseop.h" diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index d0b20a2f08066..f6f7eb31cb8e6 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -148,16 +148,4 @@ TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { ctx.PartialInitWithAllocator(); Compare2(&scope, ctx, "copy_cross_scope"); } -#elif PADDLE_WITH_ASCEND_CL -TEST(copy_cross_scope, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare1(&scope, ctx, "copy_cross_scope"); -} - -TEST(copy_cross_scope_to_main_scope, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare2(&scope, ctx, "copy_cross_scope"); -} #endif diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 639d376485b4b..37b00eda81822 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -28,15 +28,9 @@ function(detection_library TARGET_NAME) PARENT_SCOPE) endfunction() -if(WITH_ASCEND_CL) - detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op_npu.cc) - detection_library(density_prior_box_op SRCS density_prior_box_op.cc - density_prior_box_op.cu density_prior_box_op_npu.cc) -else() - detection_library(box_coder_op SRCS box_coder_op.cc) - detection_library(density_prior_box_op SRCS density_prior_box_op.cc - density_prior_box_op.cu) -endif() +detection_library(box_coder_op SRCS box_coder_op.cc) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc + density_prior_box_op.cu) if(WITH_XPU) detection_library(iou_similarity_op SRCS iou_similarity_op.cc @@ -49,11 +43,6 @@ elseif(WITH_MLU) iou_similarity_op_mlu.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc) detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc) -elseif(WITH_ASCEND_CL) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc - iou_similarity_op_npu.cc) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc) - detection_library(yolo_box_op SRCS yolo_box_op.cc) else() detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 
6d6739eed6702..8ff69a537ff7f 100644
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -36,13 +36,6 @@ inline std::vector<int> get_expand_times(
         *expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
     expand_data = cpu_expand_tensor.data<int>();
   }
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(expand_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
-    expand_data = cpu_expand_tensor.data<int>();
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (platform::is_xpu_place(expand_tensor->place())) {
     paddle::framework::TensorCopySync(
diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h
index 4343d42c2ccfc..0770dba0a44ad 100644
--- a/paddle/fluid/operators/expand_v2_op.h
+++ b/paddle/fluid/operators/expand_v2_op.h
@@ -37,13 +37,6 @@ inline std::vector<int> get_expand_shape(
         *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
     shape_data = cpu_shape_tensor.data<int>();
   }
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(shape_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
-    shape_data = cpu_shape_tensor.data<int>();
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (platform::is_xpu_place(shape_tensor->place())) {
     paddle::framework::TensorCopySync(
@@ -75,13 +68,6 @@ inline std::vector<int> get_expand_shape(
       paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
       vec_epxand_shape.push_back(*temp.data<int32_t>());
     }
-#ifdef PADDLE_WITH_ASCEND_CL
-    else if (platform::is_npu_place(tensor->place())) {  // NOLINT
-      phi::DenseTensor temp;
-      paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-      vec_epxand_shape.push_back(*temp.data<int32_t>());
-    }
-#endif
 #ifdef PADDLE_WITH_XPU
     else if (platform::is_xpu_place(tensor->place())) {  // NOLINT
       phi::DenseTensor temp;
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 7fbdbfd6d41fe..61cc7dc9f4b64 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,10 +1,3 @@
-if(WITH_ASCEND_CL)
-  cc_library(
-    beam_search_npu
-    SRCS beam_search_npu.cc
-    DEPS npu_op_runner)
-endif()
-
 if(WITH_XPU)
   cc_library(
     beam_search_xpu
@@ -13,9 +6,7 @@ if(WITH_XPU)
 endif()
 
 # please add new math_library in alphabetical order
-if(WITH_ASCEND_CL)
-  math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
-elseif(WITH_MLU)
+if(WITH_MLU)
   math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
 else()
   math_library(concat_and_split DEPS concat_and_split_functor)
diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc
index 06af45d48506a..60d7a6ee14ba7 100644
--- a/paddle/fluid/operators/memcpy_d2h_op.cc
+++ b/paddle/fluid/operators/memcpy_d2h_op.cc
@@ -122,34 +122,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     MemcpyD2HInferShapeFunctor);
 
-#ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_d2h,
-                               float,
-                               ops::MemcpyD2HKernel,
-                               double,
-                               ops::MemcpyD2HKernel,
-                               int8_t,
-                               ops::MemcpyD2HKernel,
-                               uint8_t,
-                               ops::MemcpyD2HKernel,
-                               int,
-                               ops::MemcpyD2HKernel,
-                               int64_t,
-                               ops::MemcpyD2HKernel,
-                               bool,
-                               ops::MemcpyD2HKernel,
-                               paddle::platform::bfloat16,
-                               ops::MemcpyD2HKernel,
-                               paddle::platform::complex<float>,
-                               ops::MemcpyD2HKernel,
-                               paddle::platform::complex<double>,
-                               ops::MemcpyD2HKernel,
-                               plat::float16,
-                               ops::MemcpyD2HKernel,
-                               int16_t,
-                               ops::MemcpyD2HKernel);
-#endif
-
 #ifdef 
PADDLE_WITH_IPU REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_d2h, float, diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 9754628b1b8eb..b9debd5e67a26 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -87,11 +87,7 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { op->SetAttrMap(this->Attrs()); op->SetInput("X", this->Input("X")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); -#ifndef PADDLE_WITH_ASCEND_CL op->SetInput("Norm", this->Output("Norm")); -#else - op->SetInput("Out", this->Output("Out")); -#endif op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); } }; diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index c0c05e9e0ba90..8a1d681766fab 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -25,9 +25,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/xpu/xpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#endif - #ifdef PADDLE_WITH_MLU #include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h" diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c2c61f06446b8..5d02136a80375 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -248,31 +248,6 @@ void EmplaceDeviceContexts( PADDLE_THROW( platform::errors::Unimplemented("IPUPlace is not supported. Please " "re-compile with WITH_IPU option.")); -#endif - } else if (platform::is_npu_place(place)) { -#ifdef PADDLE_WITH_ASCEND_CL - EmplaceDeviceContext( - place_to_device_context, - place, - disable_setting_default_stream_for_allocator, - /*unused*/ stream_priority); -#else - PADDLE_THROW(platform::errors::Unimplemented( - "NPUPlace is not supported. Please " - "re-compile with WITH_ASCEND_CL option.")); -#endif - } else if (platform::is_npu_pinned_place(place)) { -#ifdef PADDLE_WITH_ASCEND_CL - EmplaceDeviceContext( - place_to_device_context, - place, - disable_setting_default_stream_for_allocator, - /*unused*/ stream_priority); -#else - PADDLE_THROW(platform::errors::Unimplemented( - "NPUPinnedPlace is not supported. Please re-compile with " - "WITH_ASCEND_CL " - "option.")); #endif } } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 67b6ab8f724cb..3c8ec21adbed8 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -68,8 +68,6 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_ASCEND_CL -#endif #include "paddle/phi/backends/device_ext.h" #include "paddle/phi/backends/stream.h" @@ -89,10 +87,6 @@ struct GpuDevice; #include "paddle/phi/backends/xpu/xpu_context.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "acl/acl.h" -#endif - namespace paddle { namespace platform { @@ -150,86 +144,6 @@ namespace xpu = baidu::xpu::api; using XPUDeviceContext = phi::XPUContext; #endif -#ifdef PADDLE_WITH_ASCEND_CL -class NPUDeviceContext - : public DeviceContext, - public phi::TypeInfoTraits { - public: - explicit NPUDeviceContext(NPUPlace place); - virtual ~NPUDeviceContext(); - Eigen::DefaultDevice* eigen_device() const { return nullptr; } - const Place& GetPlace() const override; - aclrtContext context() const; - - /*! 
\brief  Wait for the completion of all operations in the stream. */
-  void Wait() const override;
-
-  /*! \brief  Return the npu stream in the device context. */
-  aclrtStream stream() const;
-
-  template <typename Callback>
-  void AddStreamCallback(Callback&& callback) const {
-    return stream_->AddCallback(callback);
-  }
-
-  void WaitStreamCallback() const { return stream_->WaitCallback(); }
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-  /*! \brief  Return hccl communicators. */
-  HcclComm hccl_comm() const { return hccl_comm_; }
-
-  /*! \brief  Set hccl communicators. */
-  void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; }
-#endif
-
-  // template <typename Callback>
-  // void AddStreamCallback(Callback&& callback) const {
-  //   return stream_->AddCallback(callback);
-  // }
-
-  // void WaitStreamCallback() const { return stream_->WaitCallback(); }
-
-  static const char* name() { return "NPUDeviceContext"; }
-
- private:
-  NPUPlace place_;
-  aclrtContext context_;
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  // HCCLContext_t hccl_context_;
-  HcclComm hccl_comm_{nullptr};
-#endif
-
-  // Needs to be the same as in other DeviceContexts,
-  // even though eigen_device_ is not used on NPU.
-  // NOTE(zhiqiu): why is this needed?
-  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
-  std::shared_ptr<stream::NPUStream> stream_;
-
-  DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
-};
-
-// Currently, NPUPinnedDeviceContext is only used for data copying.
-class NPUPinnedDeviceContext
-    : public DeviceContext,
-      public phi::TypeInfoTraits<DeviceContext, NPUPinnedDeviceContext> {
- public:
-  NPUPinnedDeviceContext();
-  explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
-
-  const Place& GetPlace() const override;
-
-  Eigen::DefaultDevice* eigen_device() const;
-
-  static const char* name() { return "NPUPinnedDeviceContext"; }
-
- private:
-  NPUPinnedPlace place_;
-  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
-};
-
-#endif
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 using CUDAPinnedDeviceContext = phi::GPUPinnedContext;
 #endif
@@ -264,18 +178,6 @@ template <>
 struct DefaultDeviceContextType<platform::XPUPlace>;
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-template <>
-struct DefaultDeviceContextType<platform::NPUPlace> {
-  using TYPE = paddle::platform::NPUDeviceContext;
-};
-
-template <>
-struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
-  using TYPE = paddle::platform::NPUPinnedDeviceContext;
-};
-#endif
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <>
 struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h
index 8659d8be902b6..dc40fbe186e88 100644
--- a/paddle/fluid/platform/device_event.h
+++ b/paddle/fluid/platform/device_event.h
@@ -38,12 +38,6 @@ USE_EVENT_WAIT(kCUDA, kCUDA)
 USE_EVENT_WAIT(kCPU, kCUDA)
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-USE_EVENT(kNPU);
-USE_EVENT_WAIT(kNPU, kNPU)
-USE_EVENT_WAIT(kCPU, kNPU)
-#endif
-
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 USE_EVENT(kCUSTOM_DEVICE);
 USE_EVENT_WAIT(kCUSTOM_DEVICE, kCUSTOM_DEVICE)
diff --git a/paddle/fluid/platform/device_event_npu.cc b/paddle/fluid/platform/device_event_npu.cc
deleted file mode 100644
index ba3ea8ffcda38..0000000000000
--- a/paddle/fluid/platform/device_event_npu.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifdef PADDLE_WITH_ASCEND_CL - -#include "paddle/fluid/platform/device_event_base.h" -#include "paddle/fluid/platform/event.h" -namespace paddle { -namespace platform { -struct NPUDeviceEventWrapper { - explicit NPUDeviceEventWrapper(const platform::Place& place) { - PADDLE_ENFORCE_EQ( - platform::is_npu_place(place), - true, - platform::errors::PreconditionNotMet( - "Required device shall be NPUPlace, but received %d. ", place)); - - device_id_ = place.device; - PADDLE_ENFORCE_GT( - device_id_, - -1, - platform::errors::PreconditionNotMet( - "Required DeviceOption.device_id > -1, but received %d. ", - device_id_)); - inner_event_ = NpuEventResourcePool::Instance().New(device_id_); - } - std::shared_ptr inner_event_; - int device_id_; -}; - -void DeviceEventCreateNPU(DeviceEvent* event, - const platform::Place& place, - unsigned int) { - event->InitEvent(std::make_shared(place)); -} - -void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) { - auto* wrapper = static_cast(event->GetEvent().get()); - auto* npu_dev_ctx = dynamic_cast(context); - PADDLE_ENFORCE_NOT_NULL( - npu_dev_ctx, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast context into NPUDeviceContext.")); - NPUEventRecord(wrapper->inner_event_.get(), npu_dev_ctx->stream()); -} - -bool DeviceEventQueryNPU(const DeviceEvent* event) { - auto* wrapper = static_cast(event->GetEvent().get()); - PADDLE_ENFORCE_NOT_NULL( - wrapper, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast event into NPUDeviceEventWrapper.")); - aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; - platform::NPUEventQuery(wrapper->inner_event_.get(), &status); - return ACL_EVENT_STATUS_COMPLETE == status; -} - -void DeviceEventFinishNPU(const DeviceEvent* event) { - auto* wrapper = static_cast(event->GetEvent().get()); - NPUEventSynchronize(wrapper->inner_event_.get()); -} - -void DeviceEventNPUWaitNPU(const DeviceEvent* event, - const DeviceContext* context) { - auto* wrapper = static_cast(event->GetEvent().get()); - auto* npu_dev_ctx = dynamic_cast(context); - PADDLE_ENFORCE_NOT_NULL( - npu_dev_ctx, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast context into NPUDeviceContext.")); - NPUStreamWaitEvent(npu_dev_ctx->stream(), wrapper->inner_event_.get()); -} - -void DeviceEventCPUWaitNPU(const DeviceEvent* event, - const DeviceContext* context) { - DeviceEventFinishNPU(event); -} - -void DeviceEventSetFinishedNPU(const DeviceEvent* event) { - // do nothing -} - -void EventResetNPU(const DeviceEvent* event) { - // do nothing -} - -} // namespace platform -} // namespace paddle - -using ::paddle::platform::kCPU; -using ::paddle::platform::kNPU; -REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU) -REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU) -REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU) -REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU) -REGISTER_EVENT_SET_FINISHED_FUNCTION( - kNPU, paddle::platform::DeviceEventSetFinishedNPU) -REGISTER_EVENT_WAIT_FUNCTION(kNPU, 
-                             kNPU,
-                             paddle::platform::DeviceEventNPUWaitNPU)
-REGISTER_EVENT_WAIT_FUNCTION(kCPU,
-                             kNPU,
-                             paddle::platform::DeviceEventCPUWaitNPU)
-REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU)
-#endif
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index b64bf81dc0d05..afa689a3f904d 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -54,7 +54,6 @@ void* GetCUDADsoHandle() { return phi::dynload::GetCUDADsoHandle(); }
 void* GetWarpCTCDsoHandle() { return phi::dynload::GetWarpCTCDsoHandle(); }
 
 void* GetNCCLDsoHandle() { return phi::dynload::GetNCCLDsoHandle(); }
-void* GetHCCLDsoHandle() { return phi::dynload::GetHCCLDsoHandle(); }
 
 void* GetTensorRtDsoHandle() { return phi::dynload::GetTensorRtDsoHandle(); }
 
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 50714dfb302eb..10b985e0b2044 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -37,7 +37,6 @@ void* GetNVRTCDsoHandle();
 void* GetCUDADsoHandle();
 void* GetWarpCTCDsoHandle();
 void* GetNCCLDsoHandle();
-void* GetHCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
 void* GetLAPACKDsoHandle();
diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc
index 497a6b3cb98c2..e9bea4d87f369 100644
--- a/paddle/fluid/platform/gen_comm_id_helper.cc
+++ b/paddle/fluid/platform/gen_comm_id_helper.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||          \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
-    defined(PADDLE_WITH_CNCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 
 #include <arpa/inet.h>
diff --git a/paddle/fluid/platform/gen_comm_id_helper.h b/paddle/fluid/platform/gen_comm_id_helper.h
index 5bd81faafcc18..0766e2e91f862 100644
--- a/paddle/fluid/platform/gen_comm_id_helper.h
+++ b/paddle/fluid/platform/gen_comm_id_helper.h
@@ -14,9 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||          \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
-    defined(PADDLE_WITH_CNCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
 #include <functional>
 #include <memory>
 #include <mutex>
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index cb99a60bd6e44..43d7c61668701 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -187,17 +187,6 @@ void InitDevices() {
     LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  // NOTE(zhiqiu): use singleton to explicitly init and finalize ACL
-  platform::AclInstance::Instance();  // NOLINT
-  try {
-    // use user specified NPUs in single-node multi-process mode.
-    devices = platform::GetSelectedNPUDevices();
-  } catch (const std::exception &exp) {
-    LOG(WARNING) << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found "
-                    "in runtime.";
-  }
-#endif
 #ifdef PADDLE_WITH_IPU
   try {
     // use user specified IPUs.
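NOTE: the NPUPinnedAllocator deleted earlier in this patch implements an
event-gated deferred free for pinned host memory: FreeImpl() keeps a block
alive while the device event recorded on it is still pending, and
AllocateImpl() first reaps blocks whose events have completed. A minimal
self-contained C++ sketch of that pattern follows; `Event` and its `done()`
query are hypothetical stand-ins for aclrtEvent and platform::NPUEventQuery,
not Paddle or ACL API.

    #include <cstdlib>
    #include <mutex>
    #include <unordered_map>

    // Hypothetical stand-in for aclrtEvent + a non-blocking completion query.
    struct Event {
      bool done() const { return true; }  // a real backend queries the runtime here
    };

    class DeferredFreePool {
     public:
      // Called instead of free(): park the block until its event completes.
      void DeferFree(void* ptr, Event ev) {
        std::lock_guard<std::mutex> guard(mu_);
        pending_.emplace(ptr, ev);
      }

      // Called at the top of each allocation, as ProcessEventsAndFree() was:
      // release every parked block whose event has completed.
      void Reap() {
        std::lock_guard<std::mutex> guard(mu_);
        for (auto it = pending_.begin(); it != pending_.end();) {
          if (it->second.done()) {
            std::free(it->first);
            it = pending_.erase(it);
          } else {
            ++it;
          }
        }
      }

     private:
      std::mutex mu_;
      std::unordered_map<void*, Event> pending_;
    };

Paddle's StreamSafeCUDAAllocator follows a similar record-then-reap scheme, so
removing the NPU-specific variant does not take the pattern out of the codebase.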
diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc deleted file mode 100644 index f64ed106bd730..0000000000000 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ /dev/null @@ -1,917 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_ASCEND_CL -#include - -#ifdef _POSIX_C_SOURCE -#undef _POSIX_C_SOURCE -#endif - -#ifdef _XOPEN_SOURCE -#undef _XOPEN_SOURCE -#endif - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/fleet/ascend_wrapper.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/pybind/ascend_wrapper_py.h" - -using namespace ge; // NOLINT -namespace py = pybind11; - -namespace paddle { -namespace pybind { - -#ifdef PADDLE_WITH_ASCEND_STRING -using AscendString = AscendString; -#else -using AscendString = std::string; -#endif - -void BindAscendWrapper(py::module *m) { - py::class_>(*m, "AscendInstance") - .def(py::init([]() { return framework::AscendInstance::GetInstance(); })) - .def("init_global_resources", - &framework::AscendInstance::InitGlobalResouces, - py::call_guard()) - .def("destroy_global_resources", - &framework::AscendInstance::DestroyGlobalResouces, - py::call_guard()) - .def("add_ascend_subgraph", - &framework::AscendInstance::AddAscendSubgraph, - py::call_guard()); -} - -std::map convert_map( - const std::map &options) { - std::map rets; - for (auto &option : options) { - AscendString key = option.first.c_str(); - AscendString val = option.second.c_str(); - rets[key] = val; - } - return rets; -} - -ge::Status ge_initialize( - std::map &options) { // NOLINT - py::gil_scoped_release release; - auto init_options = convert_map(options); - ge::Status res = ge::GEInitialize(init_options); - PADDLE_ENFORCE_EQ( - res, - ge::SUCCESS, - platform::errors::Fatal("ge initialize not success:%d", res)); - py::gil_scoped_acquire acquire; - return res; -} - -enum AttrType { - AT_INT64 = 0, - AT_INT32, - AT_UINT32, - AT_LIST_INT64, - AT_LIST_INT32, - AT_LIST_UINT32, - AT_FLOAT, - AT_LIST_FLOAT, - AT_ATTR_VALUE, - AT_STRING, - AT_LIST_STRING, - AT_BOOL, - AT_LIST_BOOL, - AT_TENSOR, - AT_LIST_TENSOR, - AT_LIST_UINT8, - AT_LIST_LIST_INT64, - AT_LIST_DT, - AT_DT, - AT_LIST_NAMEATTR, - AT_NAMEATTR -}; - -#ifdef PADDLE_WITH_ASCEND -void BindAscendDevice(py::module *m) { - py::class_(*m, "NPUDevice") - .def_static( - "get_device_count", - static_cast(&platform::ascend::NPUDevice::GetDeviceCount)); -} -#endif - -void BindAscendGraph(py::module *m) { - m->def("ge_initialize", &ge_initialize, "GEInitialize"); - m->def("ge_finalize", &GEFinalize, "GEFinalize"); - - // enum - py::enum_(*m, "GEGraphRunMode") - .value("PREDICTION", GraphRunMode::PREDICTION) - .value("TRAIN", GraphRunMode::TRAIN) - .export_values(); - - py::enum_(*m, "GEDataType") - .value("DT_FLOAT", DataType::DT_FLOAT) - .value("DT_FLOAT16", DataType::DT_FLOAT16) - .value("DT_INT8", DataType::DT_INT8) - .value("DT_INT16", 
DataType::DT_INT16) - .value("DT_UINT16", DataType::DT_UINT16) - .value("DT_UINT8", DataType::DT_UINT8) - .value("DT_INT32", DataType::DT_INT32) - .value("DT_INT64", DataType::DT_INT64) - .value("DT_UINT32", DataType::DT_UINT32) - .value("DT_UINT64", DataType::DT_UINT64) - .value("DT_BOOL", DataType::DT_BOOL) - .value("DT_DOUBLE", DataType::DT_DOUBLE) - .value("DT_STRING", DataType::DT_STRING) - .value("DT_DUAL_SUB_INT8", DataType::DT_DUAL_SUB_INT8) - .value("DT_DUAL_SUB_UINT8", DataType::DT_DUAL_SUB_UINT8) - .value("DT_COMPLEX64", DataType::DT_COMPLEX64) - .value("DT_COMPLEX128", DataType::DT_COMPLEX128) - .value("DT_QINT8", DataType::DT_QINT8) - .value("DT_QINT16", DataType::DT_QINT16) - .value("DT_QINT32", DataType::DT_QINT32) - .value("DT_QUINT8", DataType::DT_QUINT8) - .value("DT_QUINT16", DataType::DT_QUINT16) - .value("DT_RESOURCE", DataType::DT_RESOURCE) - .value("DT_STRING_REF", DataType::DT_STRING_REF) - .value("DT_DUAL", DataType::DT_DUAL) - .value("DT_UNDEFINED", DataType::DT_UNDEFINED) - .export_values(); - - py::enum_(*m, "GEFormat") - .value("FORMAT_NCHW", Format::FORMAT_NCHW) - .value("FORMAT_NHWC", Format::FORMAT_NHWC) - .value("FORMAT_ND", Format::FORMAT_ND) - .value("FORMAT_NC1HWC0", Format::FORMAT_NC1HWC0) - .value("FORMAT_FRACTAL_Z", Format::FORMAT_FRACTAL_Z) - .value("FORMAT_NC1C0HWPAD", Format::FORMAT_NC1C0HWPAD) - .value("FORMAT_NHWC1C0", Format::FORMAT_NHWC1C0) - .value("FORMAT_FSR_NCHW", Format::FORMAT_FSR_NCHW) - .value("FORMAT_FRACTAL_DECONV", Format::FORMAT_FRACTAL_DECONV) - .value("FORMAT_C1HWNC0", Format::FORMAT_C1HWNC0) - .value("FORMAT_FRACTAL_DECONV_TRANSPOSE", - Format::FORMAT_FRACTAL_DECONV_TRANSPOSE) - .value("FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS", - Format::FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS) - .value("FORMAT_NC1HWC0_C04", Format::FORMAT_NC1HWC0_C04) - .value("FORMAT_FRACTAL_Z_C04", Format::FORMAT_FRACTAL_Z_C04) - .value("FORMAT_CHWN", Format::FORMAT_CHWN) - .value("FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS", - Format::FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS) - .value("FORMAT_HWCN", Format::FORMAT_HWCN) - .value("FORMAT_NC1KHKWHWC0", Format::FORMAT_NC1KHKWHWC0) - .value("FORMAT_BN_WEIGHT", Format::FORMAT_BN_WEIGHT) - .value("FORMAT_FILTER_HWCK", Format::FORMAT_FILTER_HWCK) - .value("FORMAT_HASHTABLE_LOOKUP_LOOKUPS", - Format::FORMAT_HASHTABLE_LOOKUP_LOOKUPS) - .value("FORMAT_HASHTABLE_LOOKUP_KEYS", - Format::FORMAT_HASHTABLE_LOOKUP_KEYS) - .value("FORMAT_HASHTABLE_LOOKUP_VALUE", - Format::FORMAT_HASHTABLE_LOOKUP_VALUE) - .value("FORMAT_HASHTABLE_LOOKUP_OUTPUT", - Format::FORMAT_HASHTABLE_LOOKUP_OUTPUT) - .value("FORMAT_HASHTABLE_LOOKUP_HITS", - Format::FORMAT_HASHTABLE_LOOKUP_HITS) - .value("FORMAT_C1HWNCoC0", Format::FORMAT_C1HWNCoC0) - .value("FORMAT_MD", Format::FORMAT_MD) - .value("FORMAT_NDHWC", Format::FORMAT_NDHWC) - .value("FORMAT_FRACTAL_ZZ", Format::FORMAT_FRACTAL_ZZ) - .value("FORMAT_FRACTAL_NZ", Format::FORMAT_FRACTAL_NZ) - .value("FORMAT_NCDHW", Format::FORMAT_NCDHW) - .value("FORMAT_DHWCN", Format::FORMAT_DHWCN) - .value("FORMAT_NDC1HWC0", Format::FORMAT_NDC1HWC0) - .value("FORMAT_FRACTAL_Z_3D", Format::FORMAT_FRACTAL_Z_3D) - .value("FORMAT_CN", Format::FORMAT_CN) - .value("FORMAT_NC", Format::FORMAT_NC) - .value("FORMAT_DHWNC", Format::FORMAT_DHWNC) - .value("FORMAT_FRACTAL_Z_3D_TRANSPOSE", - Format::FORMAT_FRACTAL_Z_3D_TRANSPOSE) - .value("FORMAT_FRACTAL_ZN_LSTM", Format::FORMAT_FRACTAL_ZN_LSTM) - .value("FORMAT_FRACTAL_Z_G", Format::FORMAT_FRACTAL_Z_G) - .value("FORMAT_RESERVED", Format::FORMAT_RESERVED) - .value("FORMAT_ALL", 
Format::FORMAT_ALL) - .value("FORMAT_NULL", Format::FORMAT_NULL) - .export_values(); - - py::enum_(*m, "GEUnknowShapeOpType") - .value("DEPEND_IN_SHAPE", UnknowShapeOpType::DEPEND_IN_SHAPE) - .value("DEPEND_CONST_VALUE", UnknowShapeOpType::DEPEND_CONST_VALUE) - .value("DEPEND_SHAPE_RANGE", UnknowShapeOpType::DEPEND_SHAPE_RANGE) - .value("DEPEND_COMPUTE", UnknowShapeOpType::DEPEND_COMPUTE) - .export_values(); - - py::enum_(*m, "GEDeviceType") - .value("NPU", DeviceType::NPU) - .value("CPU", DeviceType::CPU) - .export_values(); - - py::enum_(*m, "GEAttrType") - .value("AT_INT64", AttrType::AT_INT64) - .value("AT_INT32", AttrType::AT_INT32) - .value("AT_UINT32", AttrType::AT_UINT32) - .value("AT_LIST_INT64", AttrType::AT_LIST_INT64) - .value("AT_LIST_INT32", AttrType::AT_LIST_INT32) - .value("AT_LIST_UINT32", AttrType::AT_LIST_UINT32) - .value("AT_FLOAT", AttrType::AT_FLOAT) - .value("AT_LIST_FLOAT", AttrType::AT_LIST_FLOAT) - .value("AT_ATTR_VALUE", AttrType::AT_ATTR_VALUE) - .value("AT_STRING", AttrType::AT_STRING) - .value("AT_LIST_STRING", AttrType::AT_LIST_STRING) - .value("AT_BOOL", AttrType::AT_BOOL) - .value("AT_LIST_BOOL", AttrType::AT_LIST_BOOL) - .value("AT_TENSOR", AttrType::AT_TENSOR) - .value("AT_LIST_TENSOR", AttrType::AT_LIST_TENSOR) - .value("AT_LIST_UINT8", AttrType::AT_LIST_UINT8) - .value("AT_LIST_LIST_INT64", AttrType::AT_LIST_LIST_INT64) - .value("AT_LIST_DT", AttrType::AT_LIST_DT) - .value("AT_DT", AttrType::AT_DT) - .value("AT_LIST_NAMEATTR", AttrType::AT_LIST_NAMEATTR) - .value("AT_NAMEATTR", AttrType::AT_NAMEATTR) - .export_values(); - - // 类封装 - py::class_(*m, "GESession") - .def(py::init([](const std::map &options) { - return std::unique_ptr( - new ge::Session(convert_map(options))); - })) - .def( - "add_graph", - (ge::Status(Session::*)(uint32_t, const Graph &)) & Session::AddGraph) - .def("add_graph", - [](Session &ss, - uint32_t index, - const Graph &graph, - const std::map &options) { - return ss.AddGraph(index, graph, convert_map(options)); - }) - .def("remove_graph", &Session::RemoveGraph) - .def( - "run_graph", - [](Session &ss, - uint32_t graphId, - const std::vector &inputs) -> py::tuple { - std::vector outputs; - ge::Status res = ss.RunGraph(graphId, inputs, outputs); - return py::make_tuple(outputs, res); - }, - py::call_guard()) - .def("build_graph", &Session::BuildGraph) - .def("run_graph_async", &Session::RunGraphAsync) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("register_call_back_func", - static_cast( - &ge::Session::RegisterCallBackFunc)) -#else - .def("register_call_back_func", - (Status (Session::*)( // NOLINT - const std::string &, - std::function ¶ms_list)>)) & - Session::RegisterCallBackFunc) -#endif - .def("is_graph_need_rebuild", &Session::IsGraphNeedRebuild); - - py::class_(*m, "GEGraph") - .def(py::init<>()) - .def(py::init()) - .def("set_inputs", &Graph::SetInputs) - .def("set_outputs", - (Graph & (Graph::*)(const std::vector &)) & - Graph::SetOutputs) - .def("set_outputs", - (Graph & (Graph::*)(const std::vector< - std::pair>> &)) & - Graph::SetOutputs) - .def("set_outputs", - (Graph & - (Graph::*)(const std::vector> - &)) & - Graph::SetOutputs) - .def("set_targets", &Graph::SetTargets) - .def("is_valid", &Graph::IsValid) - .def("add_op", &Graph::AddOp) - .def("find_op_by_name", - [](Graph &graph, const char *name) -> py::tuple { - ge::Operator op; - graphStatus status = graph.FindOpByName(name, op); - return py::make_tuple(op, status); - }) - .def("find_op_by_type", - [](Graph &graph, const char *type) -> py::tuple { - std::vector ops; - 
graphStatus status = graph.FindOpByType(type, ops); - return py::make_tuple(ops, status); - }) - .def("get_all_op_name", - [](Graph &graph) -> py::tuple { - std::vector op_name; - graphStatus status = graph.GetAllOpName(op_name); - return py::make_tuple(op_name, status); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("save_to_file", - static_cast( - &ge::Graph::SaveToFile)) - .def("load_from_file", - static_cast( - &Graph::LoadFromFile)) - .def("get_name", - static_cast( - &Graph::GetName)) -#else - .def("save_to_file", &Graph::SaveToFile) - .def("load_from_file", &Graph::LoadFromFile) - .def("get_name", &Graph::GetName) -#endif - .def("set_need_iteration", &Graph::SetNeedIteration); - - py::class_(*m, "GEOperator") - .def(py::init<>()) - .def(py::init()) - .def(py::init()) - .def("is_empty", &Operator::IsEmpty) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("get_name", - static_cast( - &Operator::GetName)) - .def("get_op_type", - static_cast( - &Operator::GetOpType)) - .def("set_input", - (Operator & (Operator::*)(const char *, const Operator &)) & - Operator::SetInput) - .def("set_input", - (Operator & - (Operator::*)(const char *, const Operator &, const char *)) & - Operator::SetInput) - .def( - "set_input", - (Operator & (Operator::*)(const char *, const Operator &, uint32_t)) & - Operator::SetInput) -#else - .def("get_name", &Operator::GetName) - .def("get_op_type", &Operator::GetOpType) - .def("set_input", - (Operator & (Operator::*)(const std::string &, const Operator &)) & - Operator::SetInput) - .def("set_input", - (Operator & (Operator::*)(const std::string &, const Operator &, - const std::string &)) & - Operator::SetInput) - .def("set_input", (Operator & (Operator::*)(const std::string &, - const Operator &, uint32_t)) & - Operator::SetInput) -#endif - .def("add_control_input", &Operator::AddControlInput) - .def("get_input_const_data", - [](Operator &op, const char *dst_name) -> py::tuple { - Tensor data; - graphStatus res = op.GetInputConstData(dst_name, data); - return py::make_tuple(data, res); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("get_input_desc", - (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetInputDesc) - .def("get_input_desc", - [](Operator &op, const std::string &name) { - return op.GetInputDescByName(name.c_str()); - }) - .def("get_dynamic_output_num", - static_cast( - &Operator::GetDynamicOutputNum)) - .def("get_dynamic_input_num", - static_cast( - &Operator::GetDynamicInputNum)) -#else - .def("get_input_desc", - (TensorDesc (Operator::*)(const std::string &) const) & - Operator::GetInputDesc) - .def("get_input_desc", - (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetInputDesc) - .def("get_dynamic_output_num", &Operator::GetDynamicOutputNum) - .def("get_dynamic_input_num", &Operator::GetDynamicInputNum) -#endif - .def("try_get_input_desc", - [](Operator &op, const char *name) -> py::tuple { - TensorDesc tensor_desc; - graphStatus status = op.TryGetInputDesc(name, tensor_desc); - return py::make_tuple(tensor_desc, status); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("update_input_desc", - static_cast(&Operator::UpdateInputDesc)) - .def("get_output_desc", - [](Operator &op, const std::string &name) { - return op.GetOutputDescByName(name.c_str()); - }) - .def("get_output_desc", - (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetOutputDesc) - .def("update_output_desc", - static_cast(&Operator::UpdateOutputDesc)) - .def("get_dynamic_input_desc", - static_cast(&Operator::GetDynamicInputDesc)) - .def("update_dynamic_input_desc", - 
static_cast( - &Operator::UpdateDynamicInputDesc)) - .def("get_dynamic_output_desc", - static_cast(&Operator::GetDynamicOutputDesc)) - .def("update_dynamic_output_desc", - static_cast( - &Operator::UpdateDynamicOutputDesc)) -#else - .def("update_input_desc", &Operator::UpdateInputDesc) - .def("get_output_desc", - (TensorDesc (Operator::*)(const std::string &) const) & - Operator::GetOutputDesc) - .def("get_output_desc", - (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetOutputDesc) - .def("update_output_desc", &Operator::UpdateOutputDesc) - .def("get_dynamic_input_desc", &Operator::GetDynamicInputDesc) - .def("update_dynamic_input_desc", &Operator::UpdateDynamicInputDesc) - .def("get_dynamic_output_desc", &Operator::GetDynamicOutputDesc) - .def("update_dynamic_output_desc", &Operator::UpdateDynamicOutputDesc) -#endif - .def("infer_shape_and_type", &Operator::InferShapeAndType) - .def("set_inference_context", &Operator::SetInferenceContext) - .def("get_inference_context", &Operator::GetInferenceContext) - .def("verify_all_attr", &Operator::VerifyAllAttr) - .def("get_inputs_size", &Operator::GetInputsSize) - .def("get_outputs_size", &Operator::GetOutputsSize) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("get_all_attr_names_and_types", - static_cast &) const>( - &Operator::GetAllAttrNamesAndTypes)) -#else - .def("get_all_attr_names_and_types", &Operator::GetAllAttrNamesAndTypes) -#endif - .def("set_attr_int64", - [](Operator &op, const char *name, int64_t value) -> Operator & { - int64_t tar = (int64_t)value; - return op.SetAttr(name, tar); - }) - .def("set_attr_int32", - [](Operator &op, const char *name, int32_t value) -> Operator & { - int32_t tar = (int32_t)value; - return op.SetAttr(name, tar); - }) - .def("set_attr_uint32", - [](Operator &op, const char *name, uint32_t value) -> Operator & { - uint32_t tar = (uint32_t)value; - return op.SetAttr(name, tar); - }) - .def("set_attr_vec_int64", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - int64_t tmp; - for (int i = 0; i < len; i++) { - tmp = (int64_t)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) - .def("set_attr_vec_int32", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - int32_t tmp; - for (int i = 0; i < len; i++) { - tmp = (int32_t)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) - .def("set_attr_vec_uint32", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - uint32_t tmp; - for (int i = 0; i < len; i++) { - tmp = (uint32_t)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) - .def("set_attr_list_int64", - [](Operator &op, - const char *name, - std::initializer_list &attrValue) -> Operator & { - return op.SetAttr(name, std::move(attrValue)); - }) - .def("set_attr_attrvalue", - [](Operator &op, const char *name, AttrValue &attrValue) - -> Operator & { return op.SetAttr(name, std::move(attrValue)); }) - .def("set_attr_float", - [](Operator &op, const char *name, float value) -> Operator & { - float tar = static_cast(value); - return op.SetAttr(name, tar); - }) - .def("set_attr_vec_float", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - float tmp; - for (int i = 0; i < len; i++) { - tmp = static_cast(value[i]); - tar.push_back(tmp); - } - 
return op.SetAttr(name, tar); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("set_attr_string", - (Operator & (Operator::*)(const char *, const char *)) & - Operator::SetAttr) - .def("set_attr_vec_string", - (Operator & - (Operator::*)(const char *, const std::vector &)) & - Operator::SetAttr) -#else - .def("set_attr_string", (Operator & (Operator::*)(const std::string &, - const std::string &)) & - Operator::SetAttr) - .def("set_attr_vec_string", - (Operator & (Operator::*)(const std::string &, - const std::vector &)) & - Operator::SetAttr) -#endif - .def("set_attr_bool", - [](Operator &op, const char *name, bool value) -> Operator & { - if (value) - return op.SetAttr(name, true); - else - return op.SetAttr(name, false); - }) - .def("set_attr_vec_bool", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - for (int i = 0; i < len; i++) { - if (value[i]) - tar.push_back(true); - else - tar.push_back(false); - } - return op.SetAttr(name, tar); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("set_attr_tensor", - (Operator & (Operator::*)(const char *, const Tensor &)) & - Operator::SetAttr) - .def("set_attr_vec_tensor", - (Operator & - (Operator::*)(const char *, const std::vector &)) & - Operator::SetAttr) -#else - .def("set_attr_tensor", - (Operator & (Operator::*)(const std::string &, const Tensor &)) & - Operator::SetAttr) - .def("set_attr_vec_tensor", - (Operator & - (Operator::*)(const std::string &, const std::vector &)) & - Operator::SetAttr) -#endif - .def("set_attr_vec_uint8", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - uint8_t tmp; - for (int i = 0; i < len; i++) { - tmp = (uint8_t)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("set_attr_vec_vec_int64", - (Operator & - (Operator::*)(const char *, - const std::vector> &)) & - Operator::SetAttr) -#else - .def("set_attr_vec_vec_int64", - (Operator & - (Operator::*)(const std::string &, - const std::vector> &)) & - Operator::SetAttr) -#endif - .def("set_attr_vec_dtype", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - ge::DataType tmp; - for (int i = 0; i < len; i++) { - tmp = (ge::DataType)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) - .def("set_attr_dtype", - [](Operator &op, - const char *name, - const DataType &value) -> Operator & { - ge::DataType tar = (ge::DataType)value; - return op.SetAttr(name, tar); - }) - .def("get_attr", - [](Operator &op, const char *name, AttrType type) -> py::tuple { - graphStatus res = -1; - switch (type) { - case AT_INT64: { - int64_t i_64_av; - res = op.GetAttr(name, i_64_av); - return py::make_tuple(i_64_av, res); - } break; - case AT_INT32: { - int32_t i_32_av; - res = op.GetAttr(name, i_32_av); - return py::make_tuple(i_32_av, res); - } break; - case AT_UINT32: { - uint32_t ui_32_av; - res = op.GetAttr(name, ui_32_av); - return py::make_tuple(ui_32_av, res); - } break; - case AT_LIST_INT64: { - std::vector v_i_64_av; - res = op.GetAttr(name, v_i_64_av); - return py::make_tuple(v_i_64_av, res); - } break; - case AT_LIST_INT32: { - std::vector v_i_32_av; - res = op.GetAttr(name, v_i_32_av); - return py::make_tuple(v_i_32_av, res); - } break; - case AT_LIST_UINT32: { - std::vector v_ui_32_av; - res = op.GetAttr(name, v_ui_32_av); - return py::make_tuple(v_ui_32_av, res); 
-               } break;
-               case AT_FLOAT: {
-                 float f_av;
-                 res = op.GetAttr(name, f_av);
-                 return py::make_tuple(f_av, res);
-               } break;
-               case AT_LIST_FLOAT: {
-                 std::vector<float> v_f_av;
-                 res = op.GetAttr(name, v_f_av);
-                 return py::make_tuple(v_f_av, res);
-               } break;
-               case AT_ATTR_VALUE: {
-                 AttrValue o_av;
-                 res = op.GetAttr(name, o_av);
-                 return py::make_tuple(o_av, res);
-               } break;
-               case AT_STRING: {
-                 AscendString s_av;
-                 res = op.GetAttr(name, s_av);
-                 return py::make_tuple(s_av, res);
-               } break;
-               case AT_LIST_STRING: {
-                 std::vector<AscendString> v_s_av;
-                 res = op.GetAttr(name, v_s_av);
-                 return py::make_tuple(v_s_av, res);
-               } break;
-               case AT_BOOL: {
-                 bool b_av;
-                 res = op.GetAttr(name, b_av);
-                 return py::make_tuple(b_av, res);
-               } break;
-               case AT_LIST_BOOL: {
-                 std::vector<bool> v_b_av;
-                 res = op.GetAttr(name, v_b_av);
-                 return py::make_tuple(v_b_av, res);
-               } break;
-               case AT_TENSOR: {
-                 Tensor t_av;
-                 res = op.GetAttr(name, t_av);
-                 return py::make_tuple(t_av, res);
-               } break;
-               case AT_LIST_TENSOR: {
-                 std::vector<Tensor> v_t_av;
-                 res = op.GetAttr(name, v_t_av);
-                 return py::make_tuple(v_t_av, res);
-               } break;
-               case AT_LIST_UINT8: {
-                 std::vector<uint8_t> v_ui_8_av;
-                 res = op.GetAttr(name, v_ui_8_av);
-                 return py::make_tuple(v_ui_8_av, res);
-               } break;
-               case AT_LIST_LIST_INT64: {
-                 std::vector<std::vector<int64_t>> v_v_i_64_av;
-                 res = op.GetAttr(name, v_v_i_64_av);
-                 return py::make_tuple(v_v_i_64_av, res);
-               } break;
-               case AT_DT: {
-                 ge::DataType dt_av;
-                 res = op.GetAttr(name, dt_av);
-                 return py::make_tuple(dt_av, res);
-               } break;
-               case AT_LIST_DT: {
-                 std::vector<ge::DataType> v_dt_av;
-                 res = op.GetAttr(name, v_dt_av);
-                 return py::make_tuple(v_dt_av, res);
-               } break;
-               default:
-                 return py::make_tuple(0, res);
-                 break;
-             }
-           })
-      .def("break_connect", &Operator::BreakConnect)
-      .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount)
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def("get_subgraph_names",
-           static_cast<graphStatus (Operator::*)(std::vector<AscendString> &)
-                           const>(&Operator::GetSubgraphNames))
-      .def("get_subgraph_builder",
-           static_cast<SubgraphBuilder (Operator::*)(const char *) const>(
-               &Operator::GetSubgraphBuilder))
-      .def("get_subgraph",
-           static_cast<Graph (Operator::*)(const char *) const>(
-               &Operator::GetSubgraph))
-      .def("get_dynamic_subgraph_builder",
-           static_cast<SubgraphBuilder (Operator::*)(const char *, uint32_t)
-                           const>(&Operator::GetDynamicSubgraphBuilder))
-      .def("get_dynamic_subgraph",
-           static_cast<Graph (Operator::*)(const char *, uint32_t) const>(
-               &Operator::GetDynamicSubgraph));
-#else
-      .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount)
-      .def("get_subgraph_names", &Operator::GetSubgraphNames)
-      .def("get_subgraph_builder", &Operator::GetSubgraphBuilder)
-      .def("get_subgraph", &Operator::GetSubgraph)
-      .def("get_dynamic_subgraph_builder", &Operator::GetDynamicSubgraphBuilder)
-      .def("get_dynamic_subgraph", &Operator::GetDynamicSubgraph);
-#endif
-
-  py::class_<Tensor>(*m, "GETensor")
-      .def(py::init<>())
-      .def(py::init<const TensorDesc &>())
-      .def(py::init<const TensorDesc &, const std::vector<uint8_t> &>())
-      .def(py::init<const TensorDesc &, const uint8_t *, size_t>())
-      .def("set_tensor_desc", &Tensor::SetTensorDesc)
-      .def("get_tensor_desc", &Tensor::GetTensorDesc)
-      // .def("set_data", (graphStatus(Tensor::*)(std::vector<uint8_t> &&)) &
-      // Tensor::SetData)
-      .def("set_data",
-           (graphStatus(Tensor::*)(const std::vector<uint8_t> &)) &
-               Tensor::SetData)
-      .def("set_data",
-           (graphStatus(Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData)
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def("set_data", (graphStatus(Tensor::*)(const char *)) & Tensor::SetData)
-#else
-      .def("set_data",
-           (graphStatus (Tensor::*)(const std::string &)) & Tensor::SetData)
-#endif
-      .def("set_data",
-           (graphStatus(Tensor::*)(const std::vector<AscendString> &)) &
-               Tensor::SetData)
-
-      .def("get_data",
-           [](Tensor &ts) -> py::list {
-             py::list v_data;
-             uint8_t *data = ts.GetData();
-             size_t size = ts.GetSize();
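-             // GetData() exposes the tensor's raw byte buffer and GetSize()
-             // its length; appending byte by byte gives the Python list its
-             // own copy of the data.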
-             for (size_t i = 0; i < size; ++i) {
-               v_data.append(data[i]);
-             }
-             return v_data;
-           })
-      .def("get_size", &Tensor::GetSize)
-      .def("is_valid", &Tensor::IsValid)
-      .def("clone", &Tensor::Clone);
-
-  py::class_<TensorDesc>(*m, "GETensorDesc")
-      .def(py::init<>())
-      .def(py::init<Shape, Format, DataType>(),
-           py::arg("shape"),
-           py::arg("format") = FORMAT_ND,
-           py::arg("dt") = DT_FLOAT)
-      .def(py::init<const TensorDesc &>())
-      .def("update",
-           (void(TensorDesc::*)(const Shape &, Format, DataType)) &
-               TensorDesc::Update,
-           py::arg("shape"),
-           py::arg("format") = FORMAT_ND,
-           py::arg("dt") = DT_FLOAT)
-      .def("set_shape", &TensorDesc::SetShape)
-      .def("get_shape", &TensorDesc::GetShape)
-      .def("set_unknown_dim_num_shape", &TensorDesc::SetUnknownDimNumShape)
-      .def("set_shape_range", &TensorDesc::SetShapeRange)
-      .def("get_shape_range",
-           [](TensorDesc &tensorDesc) -> py::tuple {
-             std::vector<std::pair<int64_t, int64_t>> range;
-             graphStatus status = tensorDesc.GetShapeRange(range);
-             return py::make_tuple(range, status);
-           })
-      .def("set_format", &TensorDesc::SetFormat)
-      .def("get_format", &TensorDesc::GetFormat)
-      .def("get_origin_shape", &TensorDesc::GetOriginShape)
-      .def("set_origin_shape", &TensorDesc::SetOriginShape)
-      .def("set_origin_format", &TensorDesc::SetOriginFormat)
-      .def("get_origin_format", &TensorDesc::GetOriginFormat)
-      .def("set_data_type", &TensorDesc::SetDataType)
-      .def("get_data_type", &TensorDesc::GetDataType)
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def("set_name",
-           static_cast<void (TensorDesc::*)(const char *)>(
-               &TensorDesc::SetName))
-      .def("get_name",
-           static_cast<graphStatus (TensorDesc::*)(AscendString &) const>(
-               &TensorDesc::GetName))
-#else
-      .def("set_name", &TensorDesc::SetName)
-      .def("get_name", &TensorDesc::GetName)
-#endif
-      .def("set_size", &TensorDesc::SetSize)
-      .def("get_size", &TensorDesc::GetSize)
-      .def("set_real_dim_cnt", &TensorDesc::SetRealDimCnt)
-      .def("get_real_dim_cnt", &TensorDesc::GetRealDimCnt);
-
-  py::class_<Shape>(*m, "GEShape")
-      .def(py::init<>())
-      .def(py::init<const std::vector<int64_t> &>())
-      .def("get_dim_num", &Shape::GetDimNum)
-      .def("set_dim", &Shape::SetDim)
-      .def("get_dim", &Shape::GetDim)
-      .def("get_dims", &Shape::GetDims)
-      .def("get_shape_size", &Shape::GetShapeSize);
-
-  py::class_<AttrValue>(*m, "GEAttrValue").def(py::init<>());
-
-  py::class_<OperatorFactory>(*m, "GEOperatorFactory")
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def_static("create_operator",
-                  static_cast<Operator (*)(const char *, const char *)>(
-                      &ge::OperatorFactory::CreateOperator))
-#else
-      .def("create_operator", &OperatorFactory::CreateOperator)
-#endif
-      .def("get_ops_type_list",
-           []() -> py::tuple {
-             std::vector<AscendString> all_ops;
-             graphStatus status = OperatorFactory::GetOpsTypeList(all_ops);
-             return py::make_tuple(all_ops, status);
-           })
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def_static(
-          "is_exist_op",
-          static_cast<bool (*)(const char *)>(&OperatorFactory::IsExistOp));
-#else
-      .def("is_exist_op", &OperatorFactory::IsExistOp);
-#endif
-}
-
-}  // namespace pybind
-}  // namespace paddle
-#endif
diff --git a/paddle/fluid/pybind/ascend_wrapper_py.h b/paddle/fluid/pybind/ascend_wrapper_py.h
deleted file mode 100644
index 15fb056c90e02..0000000000000
--- a/paddle/fluid/pybind/ascend_wrapper_py.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-
-void BindAscendGraph(py::module* m);
-void BindAscendWrapper(py::module* m);
-void BindAscendDevice(py::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
-#endif
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 6b5f5cb003c5d..57b62dc40870d 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -2616,19 +2616,6 @@ void BindImperative(py::module *m_ptr) {
         py::arg("ring_id"));
 #endif
 
-#if defined(PADDLE_WITH_ASCEND_CL)
-  py::class_<imperative::HCCLParallelContext,
-             imperative::ParallelContext,
-             std::shared_ptr<imperative::HCCLParallelContext>>(
-      m, "HCCLParallelContext")
-      .def(py::init<const imperative::ParallelStrategy &,
-                    const platform::NPUPlace &>())
-      .def("init", [](imperative::HCCLParallelContext &self) { self.Init(); })
-      .def("init_with_ring_id",
-           &imperative::HCCLParallelContext::InitWithRingID,
-           py::arg("ring_id"));
-#endif
-
 #if defined(PADDLE_WITH_CNCL)
diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc
--- a/paddle/fluid/pybind/place.cc
+++ b/paddle/fluid/pybind/place.cc
   py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
     NPUPlace is a descriptor of a device.
     It represents a NPU device on which a tensor will be allocated and a model will run.
-
     Examples:
         .. code-block:: python
-          # required: npu
-
           import paddle
           place = paddle.NPUPlace(0)
-
   )DOC");
   g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
-  npuplace
-      .def("__init__",
-           [](platform::NPUPlace &self, int dev_id) {
-#ifdef PADDLE_WITH_ASCEND_CL
-             if (UNLIKELY(dev_id < 0)) {
-               LOG(ERROR) << string::Sprintf(
-                   "Invalid NPUPlace(%d), device id must be 0 or "
-                   "positive integer",
-                   dev_id);
-               std::exit(-1);
-             }
-             if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
-               if (platform::GetNPUDeviceCount() == 0) {
-                 LOG(ERROR) << "Cannot use NPU because there is no NPU "
-                               "detected on your "
-                               "machine.";
-                 std::exit(-1);
-               } else {
-                 LOG(ERROR) << string::Sprintf(
-                     "Invalid NPUPlace(%d), must be inside [0, %d), because NPU "
-                     "number on your machine is %d",
-                     dev_id,
-                     platform::GetNPUDeviceCount(),
-                     platform::GetNPUDeviceCount());
-                 std::exit(-1);
-               }
-             }
-             new (&self) platform::NPUPlace(dev_id);
-#else
-             LOG(ERROR) << string::Sprintf(
-                 "Cannot use NPU because you have installed CPU/GPU version "
-                 "PaddlePaddle.\n"
-                 "If you want to use NPU, please try to install NPU version "
-                 "PaddlePaddle by: pip install paddlepaddle-npu\n"
-                 "If you only have CPU, please change NPUPlace(%d) to be "
-                 "CPUPlace().\n",
-                 dev_id);
-             std::exit(-1);
-#endif
-           })
+  npuplace.def("__init__", [](platform::NPUPlace &self, int dev_id) {})
       .def("_type", &PlaceIndex<platform::NPUPlace>)
       .def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
       .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 977c99f30fc5f..e306e0338462f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -154,10 +154,6 @@ limitations under the License. */
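The pybind.cc hunks below retire the usual compile-time capability query for a backend. For reference, a minimal sketch of the idiom being removed, where PADDLE_WITH_FOO and IsCompiledWithFoo are hypothetical stand-ins rather than real Paddle symbols:

// Sketch only: PADDLE_WITH_FOO is a placeholder for a backend build guard
// such as the removed PADDLE_WITH_ASCEND_CL.
bool IsCompiledWithFoo() {
#ifdef PADDLE_WITH_FOO
  return true;   // the optional backend was compiled in
#else
  return false;  // the backend was compiled out
#endif
}

Once the guard can never be defined, only the false arm is reachable, which is why IsCompiledWithNPU() below is folded to a one-line constant stub.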
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/platform/collective_helper.h" -#endif - #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" @@ -285,13 +281,7 @@ bool IsCompiledWithXPU() { #endif } -bool IsCompiledWithNPU() { -#ifndef PADDLE_WITH_ASCEND_CL - return false; -#else - return true; -#endif -} +bool IsCompiledWithNPU() { return false; } bool IsCompiledWithCustomDevice(std::string device_type) { #ifndef PADDLE_WITH_CUSTOM_DEVICE @@ -1606,13 +1596,9 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::NPUPlace &place) -> paddle::platform::DeviceContext * { -#ifndef PADDLE_WITH_ASCEND_CL PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use NPUPlace in CPU/GPU/XPU version, " "Please recompile or reinstall Paddle with NPU support.")); -#else - return new paddle::platform::NPUDeviceContext(place); -#endif }) .def_static("create", [](paddle::platform::CustomPlace &place) @@ -2338,39 +2324,6 @@ All parameter, weight, gradient are variables in Paddle. #endif #endif -#ifdef PADDLE_WITH_ASCEND_CL - m.def("get_npu_device_count", platform::GetNPUDeviceCount); - m.def("npu_finalize", []() { - platform::HCCLCommContext::Instance().ReleaseHCCLComms(); - - auto &pool = platform::DeviceContextPool::Instance(); - auto devices = platform::GetSelectedNPUDevices(); - for (size_t i = 0; i < devices.size(); ++i) { - platform::NPUDeviceGuard guard(devices[i]); - pool.Get(platform::NPUPlace(devices[i]))->Wait(); - } - platform::AclInstance::Instance().Finalize(); - }); - - py::class_(m, "NPUProfConfigWrapper"); - - m.def("npu_prof_init", platform::NPUProfilerInit); - m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerStart(c.ptr()); - }); - m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerStop(c.ptr()); - }); - m.def("npu_prof_finalize", platform::NPUProfilerFinalize); - m.def("npu_prof_create_config", []() { - return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig()); - }); - - m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerDestroyConfig(c.ptr()); - }); -#endif - #ifdef PADDLE_WITH_IPU m.def("get_ipu_device_count", platform::GetIPUDeviceCount); #endif diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index e2e8d7c8837d9..b854fa37ac0ba 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -139,10 +139,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/platform/collective_helper.h" -#endif - #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index e050fc7c7d544..60f7d83f03ac9 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -299,13 +299,6 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { auto p = self.place(); paddle::memory::Copy( platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); -#endif - } else if (platform::is_npu_place(self.place())) { -#if defined(PADDLE_WITH_ASCEND_CL) - const T *a = self.data(); - auto p = self.place(); - paddle::memory::Copy( - platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); #endif } else if (platform::is_custom_place(self.place())) { #if defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -350,13 +343,6 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { T *a = self->mutable_data(p); paddle::memory::Copy( p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); -#endif - } else if (platform::is_npu_place(self->place())) { -#if defined(PADDLE_WITH_ASCEND_CL) - auto p = self->place(); - T *a = self->mutable_data(p); - paddle::memory::Copy( - p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); #endif } else if (platform::is_custom_place(self->place())) { #if defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -427,21 +413,6 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " "Please recompile or reinstall Paddle with IPU support.")); -#endif - } else if (paddle::platform::is_npu_place(place)) { -#ifdef PADDLE_WITH_ASCEND_CL - platform::Place tmp_place = place; - platform::NPUDeviceGuard guard(tmp_place.device); - auto dst = self->mutable_data(place); - platform::NPUMemcpySync( - dst, array.data(), array.nbytes(), ACL_MEMCPY_HOST_TO_DEVICE); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(place); - ctx.Wait(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use NPUPlace in CPU/GPU/XPU version. 
" - "Please recompile or reinstall Paddle with NPU support.")); #endif } else if (paddle::platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU @@ -1093,39 +1064,6 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); -#endif - } else if (is_npu_tensor) { -#ifdef PADDLE_WITH_ASCEND_CL - py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); - PADDLE_ENFORCE_EQ(py_arr.writeable(), - true, - platform::errors::InvalidArgument( - "PyArray is not writable, in which case memory leak " - "or double free would occur")); - PADDLE_ENFORCE_EQ( - py_arr.owndata(), - true, - platform::errors::InvalidArgument( - "PyArray does not own data, in which case memory leak " - "or double free would occur")); - - size_t copy_bytes = sizeof_dtype * numel; - auto p = tensor.place(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(tensor.place()); - paddle::memory::Copy( - platform::CPUPlace(), - py_arr.mutable_data(), - p, - tensor_buf_ptr, - copy_bytes, - reinterpret_cast(ctx).stream()); - ctx.Wait(); - return py_arr; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use NPUPlace in CPU/GPU/XPU version, " - "Please recompile or reinstall Paddle with NPU support.")); #endif } else if (is_mlu_tensor) { #ifdef PADDLE_WITH_MLU diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index a9e1fc384085a..3804ea984f973 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -19,9 +19,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" -#if defined(PADDLE_WITH_ASCEND_CL) -#include "paddle/phi/backends/npu/npu_info.h" -#endif + #include "paddle/phi/backends/gpu/gpu_info.h" #ifdef PADDLE_WITH_MLU #include "paddle/phi/backends/mlu/mlu_info.h" @@ -44,8 +42,6 @@ inline size_t Alignment(size_t size, alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) alignment = alignment; -#elif defined(PADDLE_WITH_ASCEND_CL) - alignment = phi::backends::npu::NPUMinChunkSize(); #elif defined(PADDLE_WITH_MLU) alignment = phi::backends::mlu::MLUMinChunkSize(); #else diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 85826fe1cf79f..5225d746f29f0 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -66,11 +66,6 @@ if(WITH_ROCM) phi_dynload_warprnnt SRCS warprnnt.cc DEPS phi_dynamic_loader warprnnt) -elseif(WITH_ASCEND_CL) - cc_library( - phi_dynload_warpctc - SRCS warpctc.cc - DEPS phi_dynamic_loader warpctc npu_hccl) else() nv_library( phi_dynload_cuda diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index c7869e7eea82c..fc32e6fe35ccb 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -522,24 +522,6 @@ void* GetNCCLDsoHandle() { FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); #endif } -void* GetHCCLDsoHandle() { - std::string warning_msg( - "You may need to install 'hccl2' from Huawei official website: " - "before install PaddlePaddle."); -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath( - 
FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg); -#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) - return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); - -#elif defined(PADDLE_WITH_ASCEND_CL) - return GetDsoHandleFromSearchPath( - FLAGS_hccl_dir, "libhccl.so", true, {}, warning_msg); -#else - return GetDsoHandleFromSearchPath( - FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); -#endif -} void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) diff --git a/paddle/phi/backends/dynload/dynamic_loader.h b/paddle/phi/backends/dynload/dynamic_loader.h index c8dec39fa8356..e248696e9e689 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.h +++ b/paddle/phi/backends/dynload/dynamic_loader.h @@ -38,7 +38,6 @@ void* GetWarpCTCDsoHandle(); void* GetWarpRNNTDsoHandle(); void* GetFlashAttnDsoHandle(); void* GetNCCLDsoHandle(); -void* GetHCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetLAPACKDsoHandle(); diff --git a/paddle/phi/backends/npu/npu_info.h b/paddle/phi/backends/npu/npu_info.h deleted file mode 100644 index 21206ae0b28f3..0000000000000 --- a/paddle/phi/backends/npu/npu_info.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef PADDLE_WITH_ASCEND_CL - -namespace phi { -namespace backends { -namespace npu { - -//! Get the minimum chunk size for NPU buddy allocator. -inline size_t NPUMinChunkSize() { - // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU, - // though no document specify that explicitly. - // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for - // details. - return 1 << 9; -} - -} // namespace npu -} // namespace backends -} // namespace phi - -#endif diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index b384bed077b27..9cff3acccbd41 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -120,8 +120,7 @@ PADDLE_DEFINE_EXPORTED_bool( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * CUDA related related FLAG @@ -203,37 +202,6 @@ PADDLE_DEFINE_EXPORTED_int64( " epilogue algorithms, default is 0, means disabling exhaustive search."); #endif -#if defined(PADDLE_WITH_ASCEND_CL) -PADDLE_DEFINE_EXPORTED_string( - selected_npus, - "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (NPU). 
-    "all visible devices, set this to empty string.");
-PADDLE_DEFINE_EXPORTED_bool(
-    hccl_check_nan,
-    true,
-    "Check Nan in tensor before hccl_allreduce_sum, otherwise it will "
-    "core dump when it meets a Nan value");
-PADDLE_DEFINE_EXPORTED_string(
-    npu_config_path,
-    "",
-    "The absolute path of configuration json file, like: /tmp/config.json. "
-    "If provided, it will be passed to aclInit().");
-PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling,
-                             1,
-                             "Set the minimum loss scaling value.");
-PADDLE_DEFINE_EXPORTED_string(
-    npu_precision_mode,
-    "",
-    "NPU operator precision mode, options are 'force_fp32', 'force_fp16', "
-    "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and "
-    "'allow_mix_precision'. If you want to use the default mode ("
-    "allow_fp32_to_fp16), set this to empty string. For more details, "
-    "please refer to the documents.");
-#endif
-
 /*
  * Kernel related FLAG
  * Name: FLAGS_enable_api_kernel_fallback
@@ -558,8 +526,7 @@ PADDLE_DEFINE_EXPORTED_double(
 
 // NOTE(zhiqiu): better to share the flags, otherwise we will have too many
 // flags.
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
     defined(PADDLE_WITH_CUSTOM_DEVICE)
 
 /**
@@ -837,9 +804,8 @@ PADDLE_DEFINE_EXPORTED_bool(use_fast_math,
  * Example:
  * Note: Get host by name time.
  */
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
+    defined(PADDLE_WITH_HIP)
 PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time,
                              120,
                              "The maximum time for get host by name time");
diff --git a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h
index e2e2ffec1bfee..4a8cbd38d3df7 100644
--- a/paddle/phi/core/utils/visit_place.h
+++ b/paddle/phi/core/utils/visit_place.h
@@ -52,26 +52,6 @@ typename Visitor::result_type VisitPlace(const phi::Place& place,
       PADDLE_THROW(phi::errors::Unavailable(
           ("Paddle is not compiled with XPU. Cannot visit xpu device")));
       return typename Visitor::result_type();
-#endif
-    }
-    case phi::AllocationType::NPU: {
-#ifdef PADDLE_WITH_ASCEND_CL
-      phi::NPUPlace p(place.GetDeviceId());
-      return visitor(p);
-#else
-      PADDLE_THROW(phi::errors::Unavailable(
-          ("Paddle is not compiled with NPU. Cannot visit npu device")));
-      return typename Visitor::result_type();
-#endif
-    }
-    case phi::AllocationType::NPUPINNED: {
-#ifdef PADDLE_WITH_ASCEND_CL
-      phi::NPUPinnedPlace p;
-      return visitor(p);
-#else
-      PADDLE_THROW(phi::errors::Unavailable(
-          ("Paddle is not compiled with NPU. Cannot visit npu_pinned")));
-      return typename Visitor::result_type();
 #endif
     }
     case phi::AllocationType::IPU: {
diff --git a/paddle/phi/kernels/funcs/interpolate_function.h b/paddle/phi/kernels/funcs/interpolate_function.h
index 76ed2ccd1a9b7..23731285926da 100644
--- a/paddle/phi/kernels/funcs/interpolate_function.h
+++ b/paddle/phi/kernels/funcs/interpolate_function.h
@@ -142,13 +142,6 @@ inline std::vector<T> get_new_data_from_tensor(
     new_data = cpu_starts_tensor.data<T>();
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (new_data_tensor->place().GetType() == phi::AllocationType::NPU) {
-    phi::Copy(
-        *dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
-    new_data = cpu_starts_tensor.data<T>();
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (new_data_tensor->place().GetType() == phi::AllocationType::XPU) {
     phi::Copy(
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 580aef0ef59af..aca40f83219c9 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -15,47 +15,23 @@ function(py_test_modules TARGET_NAME)
   if(WITH_COVERAGE
      AND NOT (WITH_INCREMENTAL_COVERAGE
               AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL ""))
-    if(WITH_ASCEND_CL)
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env
-          PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
-          ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        ${py_test_modules_ENVS}
+        COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+        ${PYTHON_EXECUTABLE} -m coverage run --branch -p
+        ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else()
-    if(WITH_ASCEND_CL)
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env
-          PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
-          ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
+        ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 
   if(py_test_modules_SERIAL)
diff --git a/test/amp/CMakeLists.txt b/test/amp/CMakeLists.txt
index b4d5bfd6b84bf..60cf0f5fa43d2 100755
--- a/test/amp/CMakeLists.txt
+++ b/test/amp/CMakeLists.txt
@@ -14,47 +14,23 @@ function(py_test_modules TARGET_NAME)
   if(WITH_COVERAGE
      AND NOT (WITH_INCREMENTAL_COVERAGE
               AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL ""))
-    if(WITH_ASCEND_CL)
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env
-          PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
-          ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        ${py_test_modules_ENVS}
+        COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+        ${PYTHON_EXECUTABLE} -m coverage run --branch -p
+        ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else()
-    if(WITH_ASCEND_CL)
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env
-          PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
-          ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
+        ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 
   if(py_test_modules_SERIAL)
diff --git a/test/asp/CMakeLists.txt b/test/asp/CMakeLists.txt
index b48b833b94602..ab9c17edee0ac 100644
--- a/test/asp/CMakeLists.txt
+++ b/test/asp/CMakeLists.txt
@@ -13,10 +13,7 @@ foreach(TEST_OP ${TEST_OPS})
 endforeach()
 
 if(WITH_DISTRIBUTE)
-  if(WITH_GPU
-     OR WITH_XPU
-     OR WITH_ASCEND
-     OR WITH_ASCEND_CL)
+  if(WITH_GPU OR WITH_XPU)
     py_test_modules(test_fleet_with_asp_dynamic MODULES
                     test_fleet_with_asp_dynamic ENVS ${dist_ENVS})
     py_test_modules(test_fleet_with_asp_static MODULES