From 0b60f28c9e47cad858d95769c0007642052dbe08 Mon Sep 17 00:00:00 2001
From: engineer1109 <1292846099@qq.com>
Date: Mon, 3 Apr 2023 17:45:44 +0800
Subject: [PATCH] remove WITH_ASCEND_CL PADDLE_WITH_ASCEND_CL WITH_ASCEND_CXX11
 (#52448)

---
 CMakeLists.txt                                |  28 -
 cmake/configure.cmake                         |   4 -
 cmake/external/ascend.cmake                   | 108 ---
 cmake/external/gloo.cmake                     |  56 +-
 cmake/external/protobuf.cmake                 |   7 +-
 cmake/external/threadpool.cmake               |   6 +-
 cmake/external/warpctc.cmake                  | 139 +--
 cmake/flags.cmake                             |   4 -
 cmake/inference_lib.cmake                     |  16 +-
 cmake/operators.cmake                         |  30 -
 cmake/third_party.cmake                       |  10 -
 paddle/fluid/framework/details/CMakeLists.txt |  15 +-
 .../fluid/framework/details/nan_inf_utils.h   |   6 -
 .../framework/details/nan_inf_utils_detail.cc | 176 ----
 paddle/fluid/framework/device_worker.h        |   3 +-
 .../fluid/framework/device_worker_factory.cc  |   3 +-
 paddle/fluid/framework/executor.cc            |  17 -
 paddle/fluid/framework/fleet/CMakeLists.txt   |   7 -
 .../fluid/framework/fleet/ascend_wrapper.cc   |  22 -
 paddle/fluid/framework/fleet/ascend_wrapper.h | 214 ----
 paddle/fluid/framework/garbage_collector.cc   |  26 -
 paddle/fluid/framework/garbage_collector.h    |  22 -
 .../interpreter/execution_config.cc           |   5 -
 .../interpreter/interpreter_util.cc           |  10 -
 .../framework/new_executor/interpretercore.cc |  27 -
 paddle/fluid/framework/operator.cc            |  21 -
 paddle/fluid/framework/parallel_executor.cc   |  14 -
 paddle/fluid/framework/phi_utils.cc           |   9 -
 paddle/fluid/framework/pipeline_trainer.cc    |   5 +-
 paddle/fluid/framework/section_worker.cc      |  15 +-
 paddle/fluid/framework/tensor_test.cc         |  66 --
 paddle/fluid/framework/tensor_util.cc         | 158 +--
 paddle/fluid/framework/tensor_util.h          | 116 ---
 paddle/fluid/framework/tensor_util_test.cc    |  26 -
 paddle/fluid/framework/trainer.h              |   3 +-
 paddle/fluid/framework/trainer_factory.cc     |   3 +-
 paddle/fluid/framework/type_defs.h            |  22 -
 paddle/fluid/framework/var_type_traits.h      |  12 -
 .../ir_params_sync_among_devices_pass.cc      |  48 -
 .../ir_params_sync_among_devices_pass.h       |   4 -
 paddle/fluid/inference/api/analysis_config.cc |  29 -
 .../fluid/inference/api/analysis_predictor.cc |   8 -
 paddle/fluid/inference/api/api_impl.cc        |  17 -
 paddle/fluid/inference/api/api_impl_tester.cc |   9 -
 .../inference/api/details/zero_copy_tensor.cc |  38 -
 .../api/details/zero_copy_tensor_test.cc      |   4 -
 .../inference/api/paddle_analysis_config.h    |   6 -
 paddle/fluid/inference/capi_exp/pd_config.cc  |   5 -
 paddle/fluid/inference/capi_exp/pd_config.h   |   8 -
 paddle/fluid/inference/goapi/config.go        |   9 -
 paddle/fluid/memory/allocation/CMakeLists.txt |   5 -
 .../memory/allocation/allocator_facade.cc     |  35 +-
 .../memory/allocation/allocator_facade.h      |   7 -
 .../memory/allocation/buddy_allocator.cc      |   9 +-
 .../memory/allocation/buddy_allocator_test.cc |  31 +-
 .../allocation/naive_best_fit_allocator.cc    | 204 ----
 .../naive_best_fit_allocator_test.cc          |  16 -
 .../fluid/memory/allocation/npu_allocator.cc  |  80 --
 .../fluid/memory/allocation/npu_allocator.h   |  42 -
 .../memory/allocation/npu_pinned_allocator.cc |  99 --
 .../memory/allocation/npu_pinned_allocator.h  |  51 -
 .../memory/allocation/system_allocator.cc     | 129 ---
 .../memory/allocation/system_allocator.h      |  26 -
 .../allocation/system_allocator_test.cc       |   8 -
 paddle/fluid/memory/memcpy.cc                 | 424 +------
 paddle/fluid/operators/coalesce_tensor_op.cc  |   3 +-
 .../fluid/operators/copy_cross_scope_test.cc  |  12 -
 .../fluid/operators/detection/CMakeLists.txt  |  17 +-
 paddle/fluid/operators/expand_op.h            |   7 -
 paddle/fluid/operators/expand_v2_op.h         |  14 -
 paddle/fluid/operators/math/CMakeLists.txt    |  11 +-
 paddle/fluid/operators/memcpy_d2h_op.cc       |  28 -
 paddle/fluid/operators/norm_op.cc             |   4 -
 paddle/fluid/platform/device/device_wrapper.h |   3 -
 paddle/fluid/platform/device_context.cc       |  25 -
 paddle/fluid/platform/device_context.h        |  98 --
 paddle/fluid/platform/device_event.h          |   6 -
 paddle/fluid/platform/device_event_npu.cc     | 116 ---
 .../fluid/platform/dynload/dynamic_loader.cc  |   1 -
 .../fluid/platform/dynload/dynamic_loader.h   |   1 -
 paddle/fluid/platform/gen_comm_id_helper.cc   |   5 +-
 paddle/fluid/platform/gen_comm_id_helper.h    |   5 +-
 paddle/fluid/platform/init.cc                 |  11 -
 paddle/fluid/pybind/ascend_wrapper_py.cc      | 917 ------------------
 paddle/fluid/pybind/ascend_wrapper_py.h       |  32 -
 paddle/fluid/pybind/imperative.cc             |  13 -
 paddle/fluid/pybind/inference_api.cc          |   9 +-
 paddle/fluid/pybind/parallel_executor.cc      |   4 -
 paddle/fluid/pybind/place.cc                  |  49 +-
 paddle/fluid/pybind/pybind.cc                 |  49 +-
 paddle/fluid/pybind/tensor.cc                 |   4 -
 paddle/fluid/pybind/tensor_py.h               |  62 --
 paddle/phi/backends/device_memory_aligment.h  |   6 +-
 paddle/phi/backends/dynload/CMakeLists.txt    |   5 -
 paddle/phi/backends/dynload/dynamic_loader.cc |  18 -
 paddle/phi/backends/dynload/dynamic_loader.h  |   1 -
 paddle/phi/backends/npu/npu_info.h            |  36 -
 paddle/phi/core/flags.cc                      |  42 +-
 paddle/phi/core/utils/visit_place.h           |  20 -
 .../phi/kernels/funcs/interpolate_function.h  |   7 -
 test/CMakeLists.txt                           |  56 +-
 test/amp/CMakeLists.txt                       |  56 +-
 test/asp/CMakeLists.txt                       |   5 +-
 103 files changed, 140 insertions(+), 4400 deletions(-)
 delete mode 100644 paddle/fluid/framework/fleet/ascend_wrapper.cc
 delete mode 100644 paddle/fluid/framework/fleet/ascend_wrapper.h
 delete mode 100644 paddle/fluid/memory/allocation/npu_allocator.cc
 delete mode 100644 paddle/fluid/memory/allocation/npu_allocator.h
 delete mode 100644 paddle/fluid/memory/allocation/npu_pinned_allocator.cc
 delete mode 100644 paddle/fluid/memory/allocation/npu_pinned_allocator.h
 delete mode 100644 paddle/fluid/platform/device_event_npu.cc
 delete mode 100644 paddle/fluid/pybind/ascend_wrapper_py.cc
 delete mode 100644 paddle/fluid/pybind/ascend_wrapper_py.h
 delete mode 100644 paddle/phi/backends/npu/npu_info.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe10d96261ace..6ab6cbf54ac86 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,10 +58,6 @@ option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
 option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
 option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
 option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
-# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
-# to develop some acl related functionality on x86
-option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
-option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
 option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
 option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF)
 option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
@@ -113,14 +109,6 @@ if(APPLE AND WITH_ARM)
   set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
 endif()
 
-if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
-  if(WITH_ARM_BRPC)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
-  else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
-  endif()
-endif()
-
 if(WIN32)
   option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
@@ -525,15 +513,6 @@ if(WITH_DISTRIBUTE)
       ON
      CACHE STRING
"Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) endif() - if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC) - # disable WITH_PSCORE for NPU before include third_party - message( - WARNING - "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") - set(WITH_PSCORE - OFF - CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) - endif() if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496) # TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496 message( @@ -567,13 +546,6 @@ if(WITH_RPC) OFF CACHE BOOL "Disable WITH_RPC when not compiled with distribute" FORCE) endif() - if(WITH_ASCEND_CL AND WITH_RPC) - message( - WARNING "Disable WITH_RPC when compiling with NPU. Force WITH_RPC=OFF.") - set(WITH_RPC - OFF - CACHE BOOL "Disable WITH_RPC when compiling with NPU" FORCE) - endif() if(WITH_ROCM AND WITH_RPC) message( WARNING "Disable WITH_RPC when compiling with ROCM. Force WITH_RPC=OFF.") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 5147e54ea71fc..71e42632b2bd6 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -97,10 +97,6 @@ if(WITH_ASCEND) add_definitions(-DPADDLE_WITH_ASCEND) endif() -if(WITH_ASCEND_CL) - add_definitions(-DPADDLE_WITH_ASCEND_CL) -endif() - if(WITH_ASCEND_INT64) add_definitions(-DPADDLE_WITH_ASCEND_INT64) endif() diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index 3dbe7e6e8aa90..cbddf9496c24f 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -25,111 +25,3 @@ if(EXISTS # It means CANN 20.2 + add_definitions(-DPADDLE_WITH_ASCEND_STRING) endif() - -if(WITH_ASCEND OR WITH_ASCEND_CL) - set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) - set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) - set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) - set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) - set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) - set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) - set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) - - set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} - ${ASCEND_ATC_DIR}) - set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) - set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) - set(ATLAS_RUNTIME_INC_DIR - ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) - set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) - set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) - set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} - ${ATLAS_ATC_DIR}) - - set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) - set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) - set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) - include_directories(${ATLAS_RUNTIME_INC_DIR}) - - add_library(ascend_ge SHARED IMPORTED GLOBAL) - set_property(TARGET ascend_ge PROPERTY IMPORTED_LOCATION - ${atlas_ge_runner_lib}) - - add_library(ascend_graph SHARED IMPORTED GLOBAL) - set_property(TARGET ascend_graph PROPERTY IMPORTED_LOCATION - ${atlas_graph_lib}) - - add_library(atlas_acl SHARED IMPORTED GLOBAL) - set_property(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) - - add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) -endif() - -if(WITH_ASCEND_CL) - set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) - - set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so) - set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) - set(acl_op_compiler_lib 
-      ${ASCEND_CL_DIR}/libacl_op_compiler.so)
-  set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
-  set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
-
-  message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
-  message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
-  include_directories(${FWKACLLIB_INC_DIR})
-  include_directories(${ACLLIB_INC_DIR})
-
-  add_library(ascendcl SHARED IMPORTED GLOBAL)
-  set_property(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
-
-  add_library(ascend_hccl SHARED IMPORTED GLOBAL)
-  set_property(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
-
-  add_library(acl_op_compiler SHARED IMPORTED GLOBAL)
-  set_property(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION
-                                               ${acl_op_compiler_lib})
-  add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
-endif()
-
-if(WITH_ASCEND_CL)
-  macro(find_ascend_toolkit_version ascend_toolkit_version_info)
-    file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
-    string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)"
-                 ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
-    string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1"
-                 ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
-    string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION
-                 ${ASCEND_TOOLKIT_VERSION})
-    string(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION)
-    add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
-    if(NOT ASCEND_TOOLKIT_VERSION)
-      set(ASCEND_TOOLKIT_VERSION "???")
-    else()
-      message(
-        STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}")
-    endif()
-  endmacro()
-
-  macro(find_ascend_driver_version ascend_driver_version_info)
-    file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS)
-    string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION
-                 "${ASCEND_DRIVER_VERSION_CONTENTS}")
-    string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1"
-                 ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}")
-    if(NOT ASCEND_DRIVER_VERSION)
-      set(ASCEND_DRIVER_VERSION "???")
-    else()
-      message(
-        STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}")
-    endif()
-  endmacro()
-
-  if(WITH_ARM)
-    set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux)
-  else()
-    set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux)
-  endif()
-
-  find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info)
-  find_ascend_driver_version(${ASCEND_DIR}/driver/version.info)
-endif()
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
index 574d673b88784..0666d48538b74 100755
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -61,44 +61,24 @@ if(CMAKE_COMPILER_IS_GNUCC)
 endif()
 include_directories(${GLOO_INCLUDE_DIR})
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-                    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-            "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-else()
-  ExternalProject_Add(
-    ${GLOO_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${GLOO_REPOSITORY}
-    GIT_TAG ${GLOO_TAG}
-    PREFIX "${GLOO_PREFIX_DIR}"
-    UPDATE_COMMAND ""
-    PATCH_COMMAND ${GLOO_PATCH_COMMAND}
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND
-      mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake
-      .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . &&
-      mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
-                    ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
-            "${GLOO_INCLUDE_DIR}/gloo"
-    BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
-endif()
+ExternalProject_Add(
+  ${GLOO_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${GLOO_REPOSITORY}
+  GIT_TAG ${GLOO_TAG}
+  PREFIX "${GLOO_PREFIX_DIR}"
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${GLOO_PATCH_COMMAND}
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND
+    mkdir -p ${GLOO_SOURCE_DIR}/build && cd ${GLOO_SOURCE_DIR}/build && cmake ..
+    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && ${CMAKE_COMMAND} --build . && mkdir
+    -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
+  INSTALL_COMMAND ${CMAKE_COMMAND} -E copy
+                  ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/"
+          "${GLOO_INCLUDE_DIR}/gloo"
+  BUILD_BYPRODUCTS ${GLOO_LIBRARIES})
 
 add_library(gloo STATIC IMPORTED GLOBAL)
 set_property(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index af3a2c5d84460..7e81c0ab4b856 100755
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -237,9 +237,6 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
   if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
     set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
     set(PROTOBUF_TAG v21.12)
-  elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
-    set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
-    set(PROTOBUF_TAG v21.12)
   elseif(WITH_IPU)
     set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
     set(PROTOBUF_TAG v21.12)
@@ -325,9 +322,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
   endif()
 endfunction()
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(PROTOBUF_VERSION 21.12)
-elseif(WITH_IPU)
+if(WITH_IPU)
   set(PROTOBUF_VERSION 21.12)
 elseif(WITH_ARM_BRPC)
   set(PROTOBUF_VERSION 21.12-baidu-ee-common)
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
index 1047465095f42..afeacdc833906 100644
--- a/cmake/external/threadpool.cmake
+++ b/cmake/external/threadpool.cmake
@@ -15,11 +15,7 @@ include(ExternalProject)
 
 set(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool)
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  set(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git)
-else()
-  set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
-endif()
+set(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git)
 set(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040)
 
 set(THREADPOOL_INCLUDE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7f8da7fbe506b..e1e7234da0e25 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -64,96 +64,59 @@ else()
   set(USE_OMP ON)
 endif()
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  ExternalProject_Add(
-    extern_warpctc
-    ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
-    GIT_TAG ${WARPCTC_TAG}
-    PREFIX ${WARPCTC_PREFIX_DIR}
-    #UPDATE_COMMAND ""
-    PATCH_COMMAND ""
-    BUILD_ALWAYS 1
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-               -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-               -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-               -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-               -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-               -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-               -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-               -DWITH_GPU=${WITH_GPU}
-               -DWITH_ROCM=${WITH_ROCM}
-               -DWITH_OMP=${USE_OMP}
-               -DWITH_TORCH=OFF
-               -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-               -DBUILD_SHARED=ON
-               -DBUILD_TESTS=OFF
-               -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-               -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-               ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS
-      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
-    BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
+if(WIN32)
+  set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_DEBUG $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_C_FLAGS_RELEASE
+      $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_RELEASE
+      $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
+  set(WARPCTC_CXX_FLAGS_DEBUG
+      $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
 else()
-  if(WIN32)
-    set(WARPCTC_C_FLAGS $<FILTER:${CMAKE_C_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_DEBUG
-        $<FILTER:${CMAKE_C_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_C_FLAGS_RELEASE
-        $<FILTER:${CMAKE_C_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS $<FILTER:${CMAKE_CXX_FLAGS},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_RELEASE
-        $<FILTER:${CMAKE_CXX_FLAGS_RELEASE},EXCLUDE,/Zc:inline>)
-    set(WARPCTC_CXX_FLAGS_DEBUG
-        $<FILTER:${CMAKE_CXX_FLAGS_DEBUG},EXCLUDE,/Zc:inline>)
-  else()
-    set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS})
-    set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
-    set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+  set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()
+ExternalProject_Add(
+  extern_warpctc
+  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${WARPCTC_REPOSITORY}
+  GIT_TAG ${WARPCTC_TAG}
+  PREFIX ${WARPCTC_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  PATCH_COMMAND ${WARPCTC_PATCH_COMMAND}
+  #BUILD_ALWAYS 1
+  CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+             -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS}
+             -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG}
+             -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS}
+             -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE}
+             -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG}
+             -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+             -DWITH_GPU=${WITH_GPU}
+             -DWITH_ROCM=${WITH_ROCM}
+             -DWITH_OMP=${USE_OMP}
+             -DWITH_TORCH=OFF
+             -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+             -DBUILD_SHARED=ON
+             -DBUILD_TESTS=OFF
+             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}
+             ${EXTERNAL_OPTIONAL_ARGS}
+             ${WARPCTC_CCBIN_OPTION}
+  CMAKE_CACHE_ARGS
+    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+    -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
+  BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES})
 
 message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index b880c8028a4f6..5363b1758720d 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -167,10 +167,6 @@ if(NOT WIN32)
     set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare -Wno-non-virtual-dtor)
   endif()
 
-  if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
-    set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new)
-  endif()
-
   if(NOT APPLE)
     if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
       set(COMMON_FLAGS
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 505cfd1cab4f1..f5fc9b8b9cf8f 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -508,14 +508,9 @@ function(version version_file)
     OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
   file(
     WRITE ${version_file}
-    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
-    "WITH_MKL: ${WITH_MKL}\n"
-    "WITH_MKLDNN: ${WITH_MKLDNN}\n"
-    "WITH_GPU: ${WITH_GPU}\n"
-    "WITH_ROCM: ${WITH_ROCM}\n"
-    "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n"
-    "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n"
-    "WITH_IPU: ${WITH_IPU}\n")
+    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "WITH_MKL: ${WITH_MKL}\n"
+    "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n"
+    "WITH_ROCM: ${WITH_ROCM}\n" "WITH_IPU: ${WITH_IPU}\n")
   if(WITH_GPU)
     file(APPEND ${version_file}
          "CUDA version: ${CUDA_VERSION}\n"
@@ -526,11 +521,6 @@ function(version version_file)
          "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n"
          "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
   endif()
-  if(WITH_ASCEND_CL)
-    file(APPEND ${version_file}
-         "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n"
-         "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n")
-  endif()
   if(WITH_IPU)
     file(APPEND ${version_file} "PopART version: ${POPART_VERSION}\n")
   endif()
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 42c7cc5862a9f..34b4536e4e279 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -74,9 +74,6 @@ function(op_library TARGET)
   set(MKLDNN_FILE)
   set(op_common_deps
       operator op_registry math_function layer common_infer_shape_functions)
-  if(WITH_ASCEND_CL)
-    set(op_common_deps ${op_common_deps} npu_op_runner)
-  endif()
   if(WITH_MLU)
     set(op_common_deps ${op_common_deps} mlu_baseop)
   endif()
@@ -175,12 +172,6 @@ function(op_library TARGET)
         list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
       endif()
     endif()
-    if(WITH_ASCEND_CL)
-      string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}")
-      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc)
-        list(APPEND npu_cc_srcs ${NPU_FILE}.cc)
-      endif()
-    endif()
     if(WITH_MLU)
       string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
@@ -213,8 +204,6 @@ function(op_library TARGET)
       list(APPEND xpu_kp_cc_srcs ${src})
     elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
       list(APPEND xpu_kp_cc_srcs ${src})
-    elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
-      list(APPEND npu_cc_srcs ${src})
     elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
       list(APPEND mlu_cc_srcs ${src})
     elseif(${src} MATCHES ".*\\.cc$")
@@ -331,13 +320,6 @@ function(op_library TARGET)
       SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs}
       DEPS ${op_library_DEPS} ${op_common_deps})
   else()
-    # deal with CANN version control while registering NPU operators before build
-    if(WITH_ASCEND_CL)
-      if(CANN_VERSION LESS 504000)
-        list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc")
-        list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc")
-      endif()
-    endif()
     # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`.
     if(WITH_UNITY_BUILD AND op_library_UNITY)
       # Combine the cc source files.
@@ -541,18 +523,6 @@ function(op_library TARGET)
     endforeach()
   endif()
 
-  # pybind USE_OP_DEVICE_KERNEL for NPU
-  if(WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
-    foreach(npu_src ${npu_cc_srcs})
-      set(op_name "")
-      find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name)
-      if(NOT ${op_name} EQUAL "")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n")
-        set(pybind_flag 1)
-      endif()
-    endforeach()
-  endif()
-
   # pybind USE_OP_DEVICE_KERNEL for MLU
   if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
     foreach(mlu_src ${mlu_cc_srcs})
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 59e31b7c9aafa..42474cb801f11 100755
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -394,16 +394,6 @@ if(WITH_BOX_PS)
   list(APPEND third_party_deps extern_box_ps)
 endif()
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  include(external/ascend)
-  if(WITH_ASCEND OR WITH_ASCEND_CL)
-    list(APPEND third_party_deps extern_ascend)
-  endif()
-  if(WITH_ASCEND_CL)
-    list(APPEND third_party_deps extern_ascend_cl)
-  endif()
-endif()
-
 if(WITH_PSCORE)
   include(external/snappy)
   list(APPEND third_party_deps extern_snappy)
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index b13cb45bf988f..820846cacca6b 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -205,17 +205,10 @@ elseif(WITH_ROCM)
     SRCS fused_broadcast_op_handle.cc
     DEPS broadcast_op_handle)
 else()
-  if(WITH_ASCEND_CL)
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS npu_op_runner framework_proto scope place)
-  else()
-    cc_library(
-      nan_inf_utils
-      SRCS nan_inf_utils_detail.cc
-      DEPS framework_proto scope place)
-  endif()
+  cc_library(
+    nan_inf_utils
+    SRCS nan_inf_utils_detail.cc
+    DEPS framework_proto scope place)
   cc_library(
     all_reduce_op_handle
     SRCS all_reduce_op_handle.cc
diff --git a/paddle/fluid/framework/details/nan_inf_utils.h b/paddle/fluid/framework/details/nan_inf_utils.h
index ef2a7d8f0f1e0..ec2c1a45d0fc0 100644
--- a/paddle/fluid/framework/details/nan_inf_utils.h
+++ b/paddle/fluid/framework/details/nan_inf_utils.h
@@ -54,12 +54,6 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
   }
 }
 
-#ifdef PADDLE_WITH_ASCEND_CL
-void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::Scope& scope,
-                                 const platform::Place& place);
-#endif
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 59a40ea1f38ab..e3e08e8b7df28 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -19,8 +19,6 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/phi/common/amp_type_traits.h"
 
-#ifdef PADDLE_WITH_ASCEND_CL
-#endif
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/extensions.h"
 
@@ -243,40 +241,6 @@ void CheckVarHasNanOrInf(const std::string& op_type,
         "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile "
         "with XPU.",
         var_name));
-#endif
-    return;
-  } else if (platform::is_npu_place(tensor->place())) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (framework::TransToProtoVarType(tensor->dtype()) !=
-        proto::VarType::FP32) {
-      return;
-    }
-
-    phi::DenseTensor cpu_tensor;
-    cpu_tensor.Resize(tensor->dims());
-    float* cpu_data = static_cast<float*>(
-        cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype()));
-
-    framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-    bool flag = false;
-    for (int i = 0; i < cpu_tensor.numel(); i++) {
-      if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
-        flag = true;
-        break;
-      }
-    }
-    PADDLE_ENFORCE_NE(
-        flag,
-        true,
-        platform::errors::Fatal(
-            "Operator %s output phi::DenseTensor %s contains Inf.",
-            op_type,
-            var_name));
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile "
-        "with NPU.",
-        var_name));
 #endif
     return;
   }
@@ -309,139 +273,6 @@ bool IsSkipOp(const framework::OperatorBase& op) {
   return false;
 }
 
-#ifdef PADDLE_WITH_ASCEND_CL
-using NpuOpRunner = paddle::operators::NpuOpRunner;
-
-constexpr int FLOAT_STATUS_SIZE = 8;
-
-static phi::DenseTensor& npu_float_status() {
-  static phi::DenseTensor float_status;
-  return float_status;
-}
-
-void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::Scope& scope,
-                                 const platform::Place& place) {
-  if (!platform::is_npu_place(place)) return;
-
-  std::call_once(white_list_init_flag, InitWhiteListFormEnv);
-  if (IsSkipOp(op)) return;
-
-  auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
-      platform::DeviceContextPool::Instance().Get(place));
-  auto stream = dev_ctx->stream();
-
-  auto& flag = npu_float_status();
-  flag.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
-  NpuOpRunner("NPUAllocFloatStatus", {}, {flag}).Run(stream);
-
-  phi::DenseTensor tmp;
-  tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
-  NpuOpRunner("NPUClearFloatStatus", {tmp}, {flag}).Run(stream);
-}
-
-void PrintNpuVarInfo(const std::string& op_type,
-                     const std::string& var_name,
-                     const framework::Variable* var,
-                     const platform::Place& place) {
-  const phi::DenseTensor* tensor{nullptr};
-  if (var->IsType<phi::DenseTensor>()) {
-    tensor = &var->Get<phi::DenseTensor>();
-  } else if (var->IsType<phi::SelectedRows>()) {
-    tensor = &var->Get<phi::SelectedRows>().value();
-  } else {
-    VLOG(10) << var_name << " var_name need not to check";
-    return;
-  }
-
-  if ((framework::TransToProtoVarType(tensor->dtype()) !=
-       proto::VarType::FP32) &&
-      (framework::TransToProtoVarType(tensor->dtype()) !=
-       proto::VarType::FP16)) {
-    return;
-  }
-
-  if (tensor->memory_size() == 0) {
-    VLOG(10) << var_name << " var_name need not to check, size == 0";
-    return;
-  }
-
-  VLOG(10) << "begin check " << op_type << " var_name:" << var_name
-           << ", place:" << tensor->place() << ", numel:" << tensor->numel();
-
-  phi::DenseTensor cpu_tensor;
-  cpu_tensor.Resize(tensor->dims());
-  cpu_tensor.mutable_data(platform::CPUPlace(), tensor->dtype());
-  framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-
-  LOG(WARNING) << "print [" << var_name << "] tensor info:";
-  // use env strategy control in future, -1=print_all.
-  int print_num = 3;
-  if (framework::TransToProtoVarType(tensor->dtype()) ==
-      proto::VarType::FP32) {
-    const float* value = cpu_tensor.data<float>();
-    PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
-  } else if (framework::TransToProtoVarType(tensor->dtype()) ==
-             proto::VarType::FP16) {
-    const paddle::platform::float16* value =
-        cpu_tensor.data<paddle::platform::float16>();
-    PrintNanInf(value, tensor->numel(), print_num, op_type, var_name, false);
-  }
-}
-
-void PrintNPUOpValueInfo(const framework::OperatorBase& op,
-                         const framework::Scope& scope,
-                         const platform::Place& place) {
-  LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
-               << "), here we print some tensor value info of this op.";
-  for (auto& vname : op.InputVars()) {
-    auto* var = scope.FindVar(vname);
-    if (var == nullptr) continue;
-    PrintNpuVarInfo(op.Type(), vname, var, place);
-  }
-
-  for (auto& vname : op.OutputVars(true)) {
-    auto* var = scope.FindVar(vname);
-    if (var == nullptr) continue;
-    PrintNpuVarInfo(op.Type(), vname, var, place);
-  }
-}
-
-static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
-                                  const framework::Scope& scope,
-                                  const platform::Place& place) {
-  if (!platform::is_npu_place(place)) return;
-
-  auto* dev_ctx = reinterpret_cast<platform::NPUDeviceContext*>(
-      platform::DeviceContextPool::Instance().Get(place));
-  auto stream = dev_ctx->stream();
-
-  auto& flag = npu_float_status();
-  phi::DenseTensor tmp;
-  tmp.mutable_data<float>({FLOAT_STATUS_SIZE}, place);
-  // NPUGetFloatStatus updates data on input in-place.
-  // tmp is only placeholder.
-  NpuOpRunner("NPUGetFloatStatus", {flag}, {tmp}).Run(stream);
-
-  phi::DenseTensor cpu_tensor;
-  auto cpu_place = platform::CPUPlace();
-  float* cpu_data = static_cast<float*>(
-      cpu_tensor.mutable_data<float>({FLOAT_STATUS_SIZE}, cpu_place));
-
-  framework::TensorCopySync(flag, cpu_place, &cpu_tensor);
-  float sum = 0.0;
-  for (int i = 0; i < FLOAT_STATUS_SIZE; ++i) {
-    sum += cpu_data[i];
-  }
-
-  if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);
-
-  PADDLE_ENFORCE_LT(sum,
-                    1.0,
-                    platform::errors::PreconditionNotMet(
-                        "Operator %s contains Nan/Inf.", op.Type()));
-}
-#endif
-
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                         const framework::Scope& exec_scope,
                         const platform::Place& place) {
@@ -449,13 +280,6 @@ void CheckOpHasNanOrInf(const framework::OperatorBase& op,
 
   if (IsSkipOp(op)) return;
 
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place)) {
-    NPUCheckOpHasNanOrInf(op, exec_scope, place);
-    return;
-  }
-#endif
-
   if (op_var_nan_inf_white_list().count(op.Type()) == 0) {
     // NOTE. vname may destruct in the end of this func.
     for (auto& vname : op.OutputVars(true)) {
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 770c51e0012dd..743513e38aad1 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -674,8 +674,7 @@ class PSGPUWorker : public HogwildWorker {
 };
 #endif
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class SectionWorker : public DeviceWorker {
  public:
   SectionWorker() {}
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index 05e4fec365b69..ae01f622effa8 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -83,8 +83,7 @@ REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
 REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
 #endif
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index c35c1138df90c..c4384ea823f48 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -516,23 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
 #else
     PADDLE_THROW(
         platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
-#endif
-  } else if (platform::is_npu_place(place_)) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (IsFastEagerDeletionModeEnabled()) {
-      VLOG(4) << "Use unsafe fast gc for NPU.";
-      gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Please set FLAGS_fast_eager_deletion_mode=true to use "
-          "GarbageCollector on NPU."));
-      // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
-      VLOG(4) << "Use default stream gc for NPU.";
-      gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
-    }
-#else
-    PADDLE_THROW(
-        platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle"));
 #endif
   } else if (platform::is_mlu_place(place_)) {
 #ifdef PADDLE_WITH_MLU
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 10fb82e23049f..7ebc58e61b588 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -124,10 +124,3 @@ cc_test(
   test_fleet_cc
   SRCS test_fleet.cc
   DEPS fleet_wrapper gloo_wrapper fs shell)
-
-if(WITH_ASCEND OR WITH_ASCEND_CL)
-  cc_library(
-    ascend_wrapper
-    SRCS ascend_wrapper.cc
-    DEPS framework_proto lod_tensor ascend_ge ascend_graph)
-endif()
diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc b/paddle/fluid/framework/fleet/ascend_wrapper.cc
deleted file mode 100644
index 273939f6bee61..0000000000000
--- a/paddle/fluid/framework/fleet/ascend_wrapper.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
-namespace paddle {
-namespace framework {
-std::shared_ptr<AscendInstance> AscendInstance::ascend_instance_ = nullptr;
-}  // end namespace framework
-}  // end namespace paddle
-#endif
diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h
deleted file mode 100644
index 372f0e7d38be0..0000000000000
--- a/paddle/fluid/framework/fleet/ascend_wrapper.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include <glog/logging.h>
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "ge/ge_api.h"
-#include "graph/attr_value.h"
-#include "graph/tensor.h"
-#include "graph/types.h"
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/timer.h"
-
-namespace paddle {
-namespace framework {
-
-typedef ge::Graph AscendGraphDesc;
-
-#ifdef PADDLE_WITH_ASCEND_STRING
-using AscendString = ge::AscendString;
-#else
-using AscendString = std::string;
-#endif
-
-class AscendInstance {
- public:
-  virtual ~AscendInstance() {}
-  AscendInstance() {}
-
-  std::map<AscendString, AscendString> _GetDefaultInitOptions() {
-    std::map<AscendString, AscendString> init_options;
-    init_options["ge.exec.deviceId"] = "0";
-    init_options["ge.graphRunMode"] = "1";
-    return init_options;
-  }
-
-  std::map<AscendString, AscendString> _GetDefaultInitSessionOptions() {
-    std::map<AscendString, AscendString> init_options;
-    // init_options["a"] = "b";
-    // init_options["ge.trainFlag"] = "1";
-    return init_options;
-  }
-
-  ge::Status InitGEForUT() {
-    return ge::GEInitialize(_GetDefaultInitOptions());
-  }
-
-  void InitGlobalResouces() {
-    LOG(INFO) << "Begin ascend InitGlobalResouces";
-    session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
-    if (session_ == nullptr) {
-      PADDLE_THROW(platform::errors::Fatal("new session error: nullptr"));
-    }
-    LOG(INFO) << "End ascend InitGlobalResouces";
-  }
-
-  void DestroyGlobalResouces() {
-    LOG(INFO) << "Begin ascend DestroyGlobalResouces";
-    session_ = nullptr;
-    LOG(INFO) << "Begin ascend DestroyGlobalResouces";
-  }
-
-  static std::shared_ptr<AscendInstance> GetInstance() {
-    if (nullptr == ascend_instance_) {
-      ascend_instance_.reset(new paddle::framework::AscendInstance());
-      VLOG(1) << "Initialize AscendInstance Done";
-    }
-    return ascend_instance_;
-  }
-
-  void AddAscendSubgraph(int graph_idx, const AscendGraphDesc &graph) {
-    ge::Status status = session_->AddGraph(graph_idx, graph);
-    PADDLE_ENFORCE_EQ(status,
-                      ge::SUCCESS,
-                      paddle::platform::errors::PreconditionNotMet(
-                          "Calling addGraph of graph engine failed, please "
-                          "check Ascend Log."));
-    VLOG(1) << "AddAscendSubgraph " << graph_idx << " Done";
-  }
-
-  ge::DataType VarTypeToGeType(proto::VarType::Type type) {
-    if (type == proto::VarType::FP16) {
-      return ge::DataType::DT_FLOAT16;
-    } else if (type == proto::VarType::FP32) {
-      return ge::DataType::DT_FLOAT;
-    } else if (type == proto::VarType::FP64) {
-      return ge::DataType::DT_DOUBLE;
-    } else if (type == proto::VarType::INT32) {
-      return ge::DataType::DT_INT32;
-    } else if (type == proto::VarType::INT64) {
-      return ge::DataType::DT_INT64;
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Not support %s as tensor type.", DataTypeToString(type)));
-    }
-  }
-  int GeTypeSize(proto::VarType::Type type) {
-    if (type == proto::VarType::FP16) {
-      return 2;
-    } else if (type == proto::VarType::FP32) {
-      return 4;
-    } else if (type == proto::VarType::FP64) {
-      return 8;
-    } else if (type == proto::VarType::INT32) {
-      return 4;
-    } else if (type == proto::VarType::INT64) {
-      return 8;
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Not support %s as tensor type.", DataTypeToString(type)));
-    }
-  }
-  ge::Tensor ConvertToGeTensor(const phi::DenseTensor *tensor) {
-    auto numel = tensor->numel();
-    std::vector<int64_t> vec_dim;
-    auto dimen = arity(tensor->dims());
-    for (auto i = 0; i < dimen; ++i) {
-      vec_dim.push_back(tensor->dims()[i]);
-    }
-    // For Debug
-    // VLOG(1) << "input numel: " << numel << ", dimen is " << vec_dim.size() <<
-    // ", and shape is";
-    // for (const auto e : vec_dim) {
-    //   VLOG(0) << e;
-    // }
-
-    ge::Shape shape(vec_dim);
-    ge::TensorDesc tensor_desc(
-        shape,
-        ge::Format::FORMAT_ND,
-        VarTypeToGeType(framework::TransToProtoVarType(tensor->dtype())));
-    tensor_desc.SetRealDimCnt(vec_dim.size());
-
-    const uint8_t *data = reinterpret_cast<const uint8_t *>(tensor->data());
-    std::vector<uint8_t> dst(
-        numel * GeTypeSize(framework::TransToProtoVarType(tensor->dtype())));
-    memcpy(dst.data(),
-           data,
-           GeTypeSize(framework::TransToProtoVarType(tensor->dtype())) * numel);
-    ge::Tensor ge_tensor(tensor_desc, dst);
-    return ge_tensor;
-  }
-
-  void RunAscendSubgraph(int graph_idx,
-                         const std::vector<const phi::DenseTensor *> &inputs,
-                         std::vector<phi::DenseTensor *> *outputs) {
-    VLOG(1) << "Ascend Graph[" << graph_idx << "] is about to run.";
-    // Convert paddle phi::DenseTensor to GE phi::DenseTensor
-    std::vector<ge::Tensor> ge_inputs;
-    for (const auto &e : inputs) {
-      ge_inputs.push_back(ConvertToGeTensor(e));
-    }
-
-    // Run Graph
-    std::vector<ge::Tensor> ge_outputs;
-    ge::Status status = session_->RunGraph(graph_idx, ge_inputs, ge_outputs);
-    PADDLE_ENFORCE_EQ(status,
-                      ge::SUCCESS,
-                      paddle::platform::errors::PreconditionNotMet(
-                          "Calling RunGraph of graph engine failed, please "
-                          "check Ascend Log."));
-    VLOG(1) << "Run Ascend Graph[" << graph_idx << "] Done";
-
-    // change tensor back, note all tensor's type computed in GE is uint8
-    for (size_t i = 0; i < ge_outputs.size(); ++i) {
-      const uint8_t *ret_data = ge_outputs[i].GetData();
-      size_t size = ge_outputs[i].GetSize();
-      VLOG(1) << "GE phi::DenseTensor size of the " << i << "th output var is "
-              << size;
-      auto *dst = (*outputs)[i]->mutable_data<uint8_t>({(int64_t)size},
-                                                       platform::CPUPlace());
-      memcpy(dst, ret_data, size);
-
-      // Following for debug:
-      // VLOG(0) << "output for " << i << " var: ";
-      // float *tmp = reinterpret_cast<float *>(dst);
-      // for (size_t j = 0; j < size / 4; ++j) {
-      //   printf("%f ", tmp[j]);
-      // }
// printf("\n"); - } - } - - protected: - std::shared_ptr session_; - - private: - static std::shared_ptr ascend_instance_; -}; -} // namespace framework -} // namespace paddle -#endif diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 77a666a24d9ea..7c4b3d5c440bd 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -125,32 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback( } #endif -#ifdef PADDLE_WITH_ASCEND_CL -NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector( - const platform::NPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} - -void NPUDefaultStreamGarbageCollector::Wait() const { - static_cast(this->dev_ctx_) - ->WaitStreamCallback(); -} - -void NPUDefaultStreamGarbageCollector::ClearCallback( - const std::function &callback) { - static_cast(this->dev_ctx_) - ->AddStreamCallback(callback); -} -NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector( - const platform::NPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} - -void NPUUnsafeFastGarbageCollector::ClearCallback( - const std::function &callback) { - callback(); -} - -#endif - #ifdef PADDLE_WITH_MLU MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector( const platform::MLUPlace &place, size_t max_memory_size) diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index b75994536037a..14d38363dbe06 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -139,28 +139,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { }; #endif -#ifdef PADDLE_WITH_ASCEND_CL -class NPUDefaultStreamGarbageCollector : public GarbageCollector { - public: - NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place, - size_t max_memory_size); - - void Wait() const override; - - protected: - void ClearCallback(const std::function &callback) override; -}; - -class NPUUnsafeFastGarbageCollector : public GarbageCollector { - public: - NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place, - size_t max_memory_size); - - protected: - void ClearCallback(const std::function &callback) override; -}; -#endif - #ifdef PADDLE_WITH_MLU class MLUDefaultStreamGarbageCollector : public GarbageCollector { public: diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index 9de402450d5df..1e6a6f02e2230 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -60,11 +60,6 @@ inline std::tuple GetThreadPoolConfig(const phi::Place& place, if (platform::is_xpu_place(place)) { #if defined(PADDLE_WITH_XPU) device_count = phi::backends::xpu::GetXPUDeviceCount(); -#endif - } - if (platform::is_npu_place(place)) { -#if defined(PADDLE_WITH_ASCEND_CL) - device_count = platform::GetNPUDeviceCount(); #endif } if (platform::is_ipu_place(place)) { diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 8ba9e7a70e590..29626988132f9 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -631,16 +631,6 @@ void BuildOpFuncList(const 
     VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
 
-#ifdef PADDLE_WITH_ASCEND_CL
-    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
-    // values, but only through special `float_status` to checks whether
-    // the operation is overflow. More about `float_status`, see:
-    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
-    if (FLAGS_check_nan_inf) {
-      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
-    }
-#endif
-
     try {
       if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
         VLOG(4) << "HandleOperatorBase";
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 8d38da543ad03..bee8e8ca7b795 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -87,16 +87,6 @@ inline void SetDeviceId(const platform::Place& place) {
 #else
     auto dev_id = place.device;
     platform::SetXPUDeviceId(dev_id);
-#endif
-  } else if (platform::is_npu_place(place)) {
-#ifndef PADDLE_WITH_ASCEND_CL
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Cannot run operator on place %s, please recompile paddle or "
-        "reinstall Paddle with NPU support.",
-        place));
-#else
-    auto dev_id = place.device;
-    platform::SetNPUDeviceId(dev_id);
 #endif
   } else if (platform::is_custom_place(place)) {
 #ifndef PADDLE_WITH_CUSTOM_DEVICE
@@ -218,11 +208,6 @@ void InterpreterCore::RunImpl() {
     async_work_queue_ = GetWorkQueue();
     ExecuteInstructionList(vec_instruction_);
   }
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place_)) {
-    platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   if (platform::is_custom_place(place_)) {
     platform::DeviceContextPool::Instance().Get(place_)->Wait();
@@ -893,18 +878,6 @@ void InterpreterCore::RunOperator(const Instruction& instr_node) {
                           : var_scope_.GetMutableScope();
   VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
 
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place)) {
-    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the
-    // variable values, but only through special `float_status` to checks
-    // whether the operation is overflow. More about `float_status`, see:
-    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
-    if (FLAGS_check_nan_inf) {
-      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
-    }
-  }
-#endif
-
   auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
   {
     // If it is OperatorBase, InferShape do nothing.
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 971b377c2afe9..6a46a03b9adce 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -770,16 +770,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #else
       auto dev_id = place.device;
       platform::SetXPUDeviceId(dev_id);
-#endif
-    } else if (platform::is_npu_place(place)) {
-#ifndef PADDLE_WITH_ASCEND_CL
-      PADDLE_THROW(platform::errors::Unavailable(
-          "Cannot run operator on place %s, please recompile paddle or "
-          "reinstall Paddle with NPU support.",
-          place));
-#else
-      auto dev_id = place.device;
-      platform::SetNPUDeviceId(dev_id);
 #endif
     } else if (platform::is_mlu_place(place)) {
 #ifndef PADDLE_WITH_MLU
@@ -1692,17 +1682,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   bool fallback_to_cpu = false;
   auto* dev_ctx = pool.Get(place);
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
-  // values, but only through special `float_status` to checks whether
-  // the operation is overflow. More about `float_status`, see:
-  // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
-  if (FLAGS_check_nan_inf) {
-    framework::details::NPUAllocAndClearFloatStatus(*this, scope, place);
-  }
-#endif
-
   // using cache
   if (kernel_type_.get()) {
     dev_ctx = pool.Get(kernel_type_->place_);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 144651f1b63cc..1c703a25bea38 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -553,20 +553,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
       PADDLE_THROW(platform::errors::PermissionDenied(
          "Paddle can't use IPU device since it's not compiled with IPU,"
          "Please recompile or reinstall Paddle with IPU support."));
-#endif
-    } else if (platform::is_npu_place(place)) {
-#if defined(PADDLE_WITH_ASCEND_CL)
-      if (IsFastEagerDeletionModeEnabled()) {
-        gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
-      } else {
-        gc.reset(new NPUUnsafeFastGarbageCollector(place, max_memory_size));
-      }
-      VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
-#else
-      PADDLE_THROW(platform::errors::PermissionDenied(
-          "Paddle can't use NPU device since it's not compiled with "
-          "NPU,"
-          "Please recompile or reinstall Paddle with NPU support."));
 #endif
     } else if (platform::is_custom_place(place)) {
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc
index f2fa4c24ae2ae..b4b25726964f3 100644
--- a/paddle/fluid/framework/phi_utils.cc
+++ b/paddle/fluid/framework/phi_utils.cc
@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
         phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (kernel_key.backend() == phi::Backend::NPU) {
-    VLOG(3) << "phi missing NPU kernel: " << op.Type()
-            << ", expected_kernel_key:" << kernel_key
-            << ", fallback to CPU one!";
-    return phi::KernelKey(
-        phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
-  }
-#endif
 #ifdef PADDLE_WITH_MLU
   if (kernel_key.backend() == phi::Backend::MLU) {
     VLOG(3) << "phi missing MLU kernel: " << op.Type()
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index 7bcb6ed6f14b5..bf3a0ea31cf25 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
@@ -37,8 +36,6 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   int place_id = section_config.place_id();
 #if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL)
   place_ = platform::CUDAPlace(place_id);
-#elif (defined PADDLE_WITH_ASCEND_CL)  // NOLINT
-  place_ = platform::NPUPlace(place_id);
 #endif
   worker_ = DeviceWorkerFactory::CreateDeviceWorker(
       trainer_desc.device_worker_name());
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index ed04b1622b04f..58e879a5011c2 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -9,8 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include <float.h>
 
 #include "paddle/fluid/framework/device_worker.h"
@@ -235,18 +234,6 @@ void SectionWorker::TrainFiles() {
         gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
       }
     }
-#elif defined(PADDLE_WITH_ASCEND_CL)
-    if (IsFastEagerDeletionModeEnabled()) {
-      VLOG(4) << "Use unsafe fast gc for NPU.";
-      gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Please set FLAGS_fast_eager_deletion_mode=true to use "
-          "GarbageCollector on NPU."));
-      // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
- VLOG(4) << "Use default stream gc for NPU."; - gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size)); - } #endif } // max_memory_size >= 0 diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 42690c071bc4c..5ef6f53d38d50 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -143,35 +143,6 @@ TEST(DenseTensor, MutableData) { EXPECT_EQ(p1, p2); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - { - phi::DenseTensor src_tensor; - float* p1 = nullptr; - float* p2 = nullptr; - // initialization - p1 = src_tensor.mutable_data(phi::make_ddim({1, 2, 3}), - platform::NPUPlace(0)); - auto p1_holder = src_tensor.Holder(); - EXPECT_NE(p1, nullptr); - // set src_tensor a new dim with large size - // memory is supposed to be re-allocated - p2 = src_tensor.mutable_data(phi::make_ddim({3, 1024}), - platform::NPUPlace(0)); - auto p2_holder = src_tensor.Holder(); - EXPECT_NE(p2, nullptr); - EXPECT_NE(p1_holder.get(), p2_holder.get()); - // set src_tensor a new dim with same size - // memory block is supposed to be unchanged - p1 = src_tensor.mutable_data(phi::make_ddim({2, 2, 3}), - platform::NPUPlace(0)); - EXPECT_EQ(p1, p2); - // set src_tensor a new dim with smaller size - // memory block is supposed to be unchanged - p2 = src_tensor.mutable_data(phi::make_ddim({2, 2}), - platform::NPUPlace(0)); - EXPECT_EQ(p1, p2); - } -#endif } TEST(DenseTensor, ShareDataWith) { @@ -207,16 +178,6 @@ TEST(DenseTensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - { - phi::DenseTensor src_tensor; - phi::DenseTensor dst_tensor; - src_tensor.mutable_data(phi::make_ddim({2, 3, 4}), - platform::NPUPlace(0)); - dst_tensor.ShareDataWith(src_tensor); - ASSERT_EQ(src_tensor.data(), dst_tensor.data()); - } -#endif } TEST(DenseTensor, Slice) { @@ -271,33 +232,6 @@ TEST(DenseTensor, Slice) { EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); } #endif - -#ifdef PADDLE_WITH_ASCEND_CL - { - phi::DenseTensor src_tensor; - src_tensor.mutable_data(phi::make_ddim({6, 9}), - platform::NPUPlace(0)); - phi::DenseTensor slice_tensor = src_tensor.Slice(2, 6); - phi::DDim slice_dims = slice_tensor.dims(); - ASSERT_EQ(arity(slice_dims), 2); - EXPECT_EQ(slice_dims[0], 4); - EXPECT_EQ(slice_dims[1], 9); - - uintptr_t src_data_address = - reinterpret_cast(src_tensor.data()); - uintptr_t src_mutable_data_address = - reinterpret_cast(src_tensor.mutable_data( - src_tensor.dims(), platform::NPUPlace(0))); - uintptr_t slice_data_address = - reinterpret_cast(slice_tensor.data()); - uintptr_t slice_mutable_data_address = - reinterpret_cast(slice_tensor.mutable_data( - slice_tensor.dims(), platform::NPUPlace(0))); - EXPECT_EQ(src_data_address, src_mutable_data_address); - EXPECT_EQ(slice_data_address, slice_mutable_data_address); - EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); - } -#endif } TEST(DenseTensor, ReshapeToMatrix) { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 91b87a98447ce..4c69bdd0ff502 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -125,112 +125,6 @@ void TensorCopyImpl(const TENSOR& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - // TODO(zhiqiu): handle different condition like CUDA code below - else if (platform::is_npu_place(src_place) && // NOLINT -
platform::is_cpu_place(dst_place)) { - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); - } - else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { - // 1. cpu tensor -> npu pinned tensor - platform::NPUPinnedPlace npu_pinned_place; - phi::DenseTensor npu_pinned_tensor; - npu_pinned_tensor.Resize(src.dims()); - auto npu_pinned_ptr = - npu_pinned_tensor.mutable_data(npu_pinned_place, src.dtype()); - memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); - - // 2. async copy npu pinned tensor -> npu tensor - memory::Copy( - dst_place, - dst_ptr, - npu_pinned_place, - npu_pinned_ptr, - size, - reinterpret_cast(ctx).stream()); - - // 3. record event - auto npu_pinned_allocator = - static_cast( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(npu_pinned_place) - .get()); - phi::Allocation* allocation = npu_pinned_tensor.Holder().get(); - npu_pinned_allocator->RecordEvent( - allocation, - reinterpret_cast(ctx).stream()); - } - else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { - if (src_ptr == dst_ptr) { - VLOG(3) << "Skip copy the same data async from " << src_place << " to " - << dst_place; - return; - } - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); - } - else if (platform::is_npu_pinned_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { /* npu_pinned->npu */ - auto src_npu_pinned_place = src_place; - auto dst_npu_place = dst_place; - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ( - platform::is_npu_place(ctx_place), - true, - platform::errors::PreconditionNotMet( - "Device context place mismatch. When copying phi::DenseTensor " - "data from NPU Pinned memory to NPU memory, current " - "device context place should be NPU.")); - auto ctx_npu_place = ctx_place; - PADDLE_ENFORCE_EQ(dst_npu_place, - ctx_npu_place, - platform::errors::PreconditionNotMet( - "The target NPU device and current device context do " - "not match. The target NPU device number is %d, but " - "device context NPU number is %d.", - dst_npu_place.device, - ctx_npu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy( - dst_npu_place, dst_ptr, src_npu_pinned_place, src_ptr, size, stream); - } - else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_npu_pinned_place(dst_place)) { /* npu->npu_pinned */ - auto src_npu_place = src_place; - auto dst_npu_pinned_place = dst_place; - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ( - platform::is_npu_place(ctx_place), - true, - platform::errors::PreconditionNotMet( - "Device context place mismatch. When copying phi::DenseTensor " - "data from NPU memory to NPU Pinned memory, current " - "device context place should be NPU.")); - auto ctx_npu_place = ctx_place; - PADDLE_ENFORCE_EQ(src_place, - ctx_npu_place, - platform::errors::PreconditionNotMet( - "The source NPU device and current device context do " - "not match. 
The source NPU device number is %d, but " - "device context NPU number is %d.", - src_npu_place.device, - ctx_npu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy( - dst_npu_pinned_place, dst_ptr, src_npu_place, src_ptr, size, stream); - } - else { // NOLINT - PADDLE_THROW(platform::errors::Unimplemented( - "Copy from %s to %s is not supported.", src_place, dst_place)); - } -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -539,29 +433,6 @@ void TensorCopySync(const phi::DenseTensor& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } - else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { /* cpu -> npu*/ - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } - else if (platform::is_npu_place(src_place) && // NOLINT - platform::is_npu_place(dst_place)) { /* npu -> npu*/ - if (src_ptr == dst_ptr) { - VLOG(3) << "Skip copy the same data sync from " << src_place << " to " - << dst_place; - return; - } - memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); - } - else { // NOLINT - PADDLE_THROW(platform::errors::Unimplemented( - "Copy from %s to %s is not supported.", src_place, dst_place)); - } -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -758,31 +629,6 @@ void TensorToStream(std::ostream& os, #else PADDLE_THROW(platform::errors::Unimplemented( "MLUPlace is not supported when not compiled with MLU")); -#endif - } else if (platform::is_npu_place(tensor.place())) { -#ifdef PADDLE_WITH_ASCEND_CL - constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB - std::unique_ptr buf(new char[kBufSize]); - auto& npu_dev_ctx = - static_cast(dev_ctx); - platform::CPUPlace cpu; - uintptr_t data = reinterpret_cast(data_ptr); - while (size != 0) { - size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, - buf.get(), - tensor.place(), - reinterpret_cast(data), - size_to_write, - npu_dev_ctx.stream()); - npu_dev_ctx.Wait(); - os.write(buf.get(), size_to_write); - data += size_to_write; - size -= size_to_write; - } -#else - PADDLE_THROW(platform::errors::Unimplemented( - "NPUPlace is not supported when not compiled with NPU")); #endif } else if (platform::is_custom_place(tensor.place())) { #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -875,7 +721,7 @@ void TensorFromStream(std::istream& is, platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(shape)); framework::VisitDataType( @@ -958,7 +804,7 @@ void TensorFromStream(std::istream& is, platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_CUSTOM_DEVICE) 
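// For reference, the TensorToStream branch deleted above streams NPU memory
// to an ostream through a fixed 64 MB host staging buffer rather than
// materializing the whole tensor on the host. A minimal sketch of that
// chunked-copy pattern follows; `CopyDeviceToHost` is a hypothetical
// stand-in for the synchronous device-to-host copy (the removed code used
// memory::Copy on the NPU stream followed by npu_dev_ctx.Wait()).

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <memory>
#include <ostream>

// Hypothetical stand-in: a blocking device-to-host copy. Implemented as a
// plain memcpy here so the sketch compiles; real code copies across devices.
static void CopyDeviceToHost(void* dst, const void* src, size_t n) {
  std::memcpy(dst, src, n);
}

void WriteDeviceBufferToStream(std::ostream& os,
                               const void* device_ptr,
                               size_t size) {
  constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64 MB, as in the removed code
  std::unique_ptr<char[]> buf(new char[kBufSize]);
  auto data = reinterpret_cast<uintptr_t>(device_ptr);
  while (size != 0) {
    size_t size_to_write = std::min(kBufSize, size);
    // Stage one chunk on the host, then flush it to the stream.
    CopyDeviceToHost(buf.get(), reinterpret_cast<const void*>(data),
                     size_to_write);
    os.write(buf.get(), size_to_write);
    data += size_to_write;
    size -= size_to_write;
  }
}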
phi::DenseTensor cpu_tensor; cpu_tensor.Resize(phi::make_ddim(dims)); framework::VisitDataType( diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 35a612678cb3e..196487bda96aa 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -25,9 +25,6 @@ limitations under the License. */ #include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_MLU #include "paddle/fluid/platform/device/mlu/device_context.h" @@ -145,37 +142,6 @@ void TensorFromArray(const T* src, reinterpret_cast(ctx).stream()); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(dst_place)) { // NOLINT - // 1. vector -> npu pinned tensor - platform::NPUPinnedPlace npu_pinned_place; - phi::DenseTensor npu_pinned_tensor; - npu_pinned_tensor.Resize(dst->dims()); - auto npu_pinned_ptr = - npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype()); - memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); - - // 2. async copy npu pinned tensor -> npu tensor - memory::Copy( - dst_place, - dst_ptr, - npu_pinned_place, - npu_pinned_ptr, - size, - reinterpret_cast(ctx).stream()); - - // 3. record event - auto npu_pinned_allocator = - static_cast( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(npu_pinned_place) - .get()); - phi::Allocation* allocation = npu_pinned_tensor.Holder().get(); - npu_pinned_allocator->RecordEvent( - allocation, - reinterpret_cast(ctx).stream()); - } -#endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); @@ -227,42 +193,6 @@ void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - // NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from - // cudaMemcpyAsync. - // cudaMemcpyAsync is actually "sync" between cpu <-> gpu. - // aclrtMemcpyAsync is really "async" between cpu <-> npu. - // Since vector is on cpu, I think this function should be a "sync" operation, - // so pass nullptr as stream to memory::Copy(). - else if (platform::is_npu_place(dst_place)) { // NOLINT - // 1. vector -> npu pinned tensor - phi::DenseTensor npu_pinned_tensor(dst->dtype()); - platform::NPUPinnedPlace npu_pinned_place; - auto npu_pinned_ptr = - npu_pinned_tensor.mutable_data(dst->dims(), npu_pinned_place); - memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); - - // 2. async copy npu pinned tensor -> npu tensor - memory::Copy( - dst_place, - dst_ptr, - npu_pinned_place, - npu_pinned_ptr, - size, - reinterpret_cast(ctx).stream()); - - // 3.
record event - auto npu_pinned_allocator = - static_cast( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(npu_pinned_place) - .get()); - phi::Allocation* allocation = npu_pinned_tensor.Holder().get(); - npu_pinned_allocator->RecordEvent( - allocation, - reinterpret_cast(ctx).stream()); - } -#endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); @@ -324,37 +254,6 @@ inline void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(dst_place)) { // NOLINT - // 1. vector -> npu pinned tensor - platform::NPUPinnedPlace npu_pinned_place; - phi::DenseTensor npu_pinned_tensor; - npu_pinned_tensor.Resize(dst->dims()); - auto npu_pinned_ptr = - npu_pinned_tensor.mutable_data(npu_pinned_place, dst->dtype()); - memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); - - // 2. async copy npu pinned tensor -> npu tensor - memory::Copy( - dst_place, - dst_ptr, - npu_pinned_place, - npu_pinned_ptr, - size, - reinterpret_cast(ctx).stream()); - - // 3. record event - auto npu_pinned_allocator = - static_cast( - paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(npu_pinned_place) - .get()); - phi::Allocation* allocation = npu_pinned_tensor.Holder().get(); - npu_pinned_allocator->RecordEvent( - allocation, - reinterpret_cast(ctx).stream()); - } -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(dst_place)) { // NOLINT auto stream = @@ -433,11 +332,6 @@ void TensorToVector(const phi::DenseTensor& src, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); - } -#endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT memory::Copy( @@ -491,11 +385,6 @@ inline void TensorToVector(const phi::DenseTensor& src, memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); - } -#endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); @@ -566,11 +455,6 @@ inline T GetValue(const phi::DenseTensor* x) { if (!platform::is_cpu_place(x->place())) { phi::DenseTensor cpu_x; framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); -#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - const platform::DeviceContext* dev_ctx = pool.Get(x->place()); - dev_ctx->Wait(); -#endif value = cpu_x.data()[0]; } else { value = x->data()[0]; diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 9097c43023bd2..bda2681f57f31 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -299,32 +299,6 @@ TEST(TensorToVector, Tensor_bool) { } } #endif -#ifdef PADDLE_WITH_ASCEND_CL - { - std::vector src_vec = { - false, - true, - false, - true, - false, - true, - false, - true, - false, - }; - phi::DenseTensor npu_tensor; - paddle::platform::NPUPlace place(0); - paddle::platform::NPUDeviceContext 
npu_ctx(place); - paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); - - std::vector dst; - paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); - - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_vec[i], dst[i]); - } - } -#endif } TEST(TensorFromDLPack, Tensor) { diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 08696e4112db9..455487541abb9 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -302,8 +302,7 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 48ea9143d621a..16aa069a0c33a 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -82,8 +82,7 @@ REGISTER_TRAINER_CLASS(HeterXpuTrainer); (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 1b43bd25eeef0..961b7c1e663c0 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -65,28 +65,6 @@ using Attribute = paddle::variant>; using AttributeMap = std::unordered_map; -#ifdef PADDLE_WITH_ASCEND_CL -using NPUAttribute = paddle::variant, - std::vector, - std::vector, - bool, - std::vector, - BlockDesc*, - int64_t, - std::vector, - std::vector, - std::vector, - VarDesc*, - std::vector, - std::vector>>; - -using NPUAttributeMap = std::unordered_map; -#endif - using OpCreator = std::function -#include -#endif - #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -69,10 +64,6 @@ class Communicator; class NCCLCommunicator; #endif #endif -#ifdef PADDLE_WITH_ASCEND_CL -class Communicator; -class HCCLCommunicator; -#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -205,9 +196,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif -#if defined(PADDLE_WITH_ASCEND_CL) - HcclRootInfo, -#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 503ea531f171a..3e0d3348bc790 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -36,49 +36,6 @@ namespace paddle { namespace inference { namespace analysis { -#ifdef PADDLE_WITH_ASCEND_CL -void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { - if (!argument->use_npu()) return; - - auto &graph = argument->main_graph(); - std::vector repetitive_params; - - if (graph.Has(framework::ir::kRepetitiveParamAttr)) - repetitive_params = graph.Get>( - framework::ir::kRepetitiveParamAttr); - - LOG(INFO) << "Sync params from CPU to NPU"; - - PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(), - true, - platform::errors::PreconditionNotMet( - "The npu_device_id 
field should be valid")); - platform::Place place = platform::NPUPlace(argument->npu_device_id()); - auto *scope = argument->scope_ptr(); - std::vector all_vars = scope->LocalVarNames(); - - for (auto &var_name : all_vars) { - auto *var = scope->FindLocalVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - var, - platform::errors::PreconditionNotMet("The var should not be nullptr")); - - if (var->IsType()) { - auto *t = var->GetMutable(); - - platform::CPUPlace cpu_place; - phi::DenseTensor temp_tensor; - temp_tensor.Resize(t->dims()); - temp_tensor.mutable_data(cpu_place); - - paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - t->clear(); - paddle::framework::TensorCopySync(temp_tensor, place, t); - } - } -} -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. @@ -253,11 +210,6 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { argument->scope_valid(), true, platform::errors::PreconditionNotMet("The scope field should be valid")); -#ifdef PADDLE_WITH_ASCEND_CL - if (argument->use_npu_valid()) { - CopyParamsToNpu(argument); - } -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (argument->use_gpu_valid()) { CopyParamsToGpu(argument); diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index 3ffecc72a50f5..9db17abc24d2a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -35,10 +35,6 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { std::string repr() const override; private: -#ifdef PADDLE_WITH_ASCEND_CL - void CopyParamsToNpu(Argument *argument); -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void CopyParamsToGpu(Argument *argument); #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 7ae0d0f636588..b0f53c1f639ac 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -195,21 +195,6 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) { Update(); } -void AnalysisConfig::EnableNpu(int device_id) { -#if defined(PADDLE_WITH_ASCEND_CL) - use_npu_ = true; - npu_device_id_ = device_id; -#elif defined(PADDLE_WITH_CUSTOM_DEVICE) - use_custom_device_ = true; - custom_device_id_ = device_id; - custom_device_type_ = "npu"; -#else - LOG(ERROR) << "Please compile with npu to EnableNpu()"; - use_npu_ = false; -#endif - Update(); -} - void AnalysisConfig::EnableCustomDevice(const std::string &device_type, int device_id, Precision precision_mode) { @@ -1023,20 +1008,6 @@ void AnalysisConfig::Update() { "with XPU-runtime.")); #endif } - - if (use_npu_) { -#if defined(PADDLE_WITH_ASCEND_CL) || defined(LITE_SUBGRAPH_WITH_NPU) - PADDLE_ENFORCE_EQ(use_gpu_, - false, - platform::errors::Unavailable( - "Currently, NPU and GPU cannot be enabled in the " - "same analysis configuration.")); -#else - PADDLE_THROW(platform::errors::Unavailable( - "You tried to use an NPU device, but Paddle was not compiled " - "with NPU-runtime.")); -#endif - } if (use_ipu_) { #ifndef PADDLE_WITH_IPU PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc 
b/paddle/fluid/inference/api/analysis_predictor.cc index 91dee8a9ae4ee..ce47e9ff5e48e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -376,14 +376,6 @@ void AnalysisPredictor::InitPlace() { "with WITH_XPU.")); #endif // PADDLE_WITH_XPU } - } else if (config_.use_npu()) { -#ifdef PADDLE_WITH_ASCEND_CL - place_ = paddle::platform::NPUPlace(config_.npu_device_id()); -#else - PADDLE_THROW(platform::errors::Unavailable( - "You tried to use NPU forward propagation, but Paddle was not compiled " - "with WITH_ASCEND_CL.")); -#endif } else if (config_.NNAdapter().use_nnadapter) { if (config_.lite_engine_enabled()) { place_ = paddle::platform::CPUPlace(); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 3a77c1b878aba..0d5c8f98020a8 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -278,23 +278,6 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, #else PADDLE_THROW(platform::errors::Unavailable( "Not compile with XPU, should not reach here.")); -#endif - } else { -#ifdef PADDLE_WITH_ASCEND_CL - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto *dev_ctx = - static_cast(pool.Get(place_)); - auto dst_npu_place = place_; - memory::Copy(dst_npu_place, - static_cast(input_ptr), - platform::CPUPlace(), - inputs[i].data.data(), - inputs[i].data.length(), - dev_ctx->stream()); -#else - PADDLE_THROW(platform::errors::Unavailable( - "Not compile with NPU, should not reach here.")); #endif } diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 67dc193feed09..1416dacb833d9 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -305,15 +305,6 @@ TEST(inference_api_native, image_classification_xpu) { } #endif -#ifdef PADDLE_WITH_ASCEND_CL -TEST(inference_api_native, word2vec_npu) { - MainWord2Vec(paddle::PaddlePlace::kNPU); -} -// TEST(inference_api_native, image_classification_npu) { -// MainImageClassification(paddle::PaddlePlace::kNPU); -// } -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(paddle::PaddlePlace::kGPU); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 0a0a27bb6a6a0..52204ff3658f4 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -244,25 +244,6 @@ void Tensor::CopyFromCpu(const T *data) { PADDLE_THROW(paddle::platform::errors::Unavailable( "Can not create tensor with XPU place because paddle is not compiled " "with XPU.")); -#endif - } else if (place_ == PlaceType::kNPU) { -#ifdef PADDLE_WITH_ASCEND_CL - paddle::platform::DeviceContextPool &pool = - paddle::platform::DeviceContextPool::Instance(); - paddle::platform::NPUPlace npu_place(device_); - auto *t_data = tensor->mutable_data(npu_place); - auto *dev_ctx = static_cast( - pool.Get(npu_place)); - paddle::memory::Copy(npu_place, - static_cast(t_data), - paddle::platform::CPUPlace(), - data, - ele_size, - dev_ctx->stream()); -#else - PADDLE_THROW(paddle::platform::errors::Unavailable( - "Can not create tensor with NPU place because paddle is not compiled " - "with NPU.")); #endif } else { #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -468,25 +449,6 @@ void Tensor::CopyToCpuImpl(T *data, 
PADDLE_THROW(paddle::platform::errors::Unavailable( "Can not create tensor with XPU place because paddle is not compiled " "with XPU.")); -#endif - } else if (place_ == PlaceType::kNPU) { -#ifdef PADDLE_WITH_ASCEND_CL - paddle::platform::DeviceContextPool &pool = - paddle::platform::DeviceContextPool::Instance(); - auto npu_place = t_place; - auto *dev_ctx = static_cast( - pool.Get(npu_place)); - paddle::memory::Copy(paddle::platform::CPUPlace(), - static_cast(data), - npu_place, - t_data, - ele_num * sizeof(T), - dev_ctx->stream()); - paddle::platform::NPUStreamSync(dev_ctx->stream()); -#else - PADDLE_THROW(paddle::platform::errors::Unavailable( - "Can not create tensor with NPU place because paddle is not compiled " - "with NPU.")); #endif } else { #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index 1a6f1a2669b89..c3589f4251791 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -146,10 +146,6 @@ TEST(Tensor, FillRandomDataAndCheck) { ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kGPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kGPU)); #endif -#ifdef PADDLE_WITH_ASCEND_CL - ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kNPU)); - ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kNPU)); -#endif #ifdef PADDLE_WITH_XPU ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kXPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kXPU)); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 63401f2fec6cf..585f12e4d07d7 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -363,12 +363,6 @@ struct PD_INFER_DECL AnalysisConfig { /// void SetXpuDeviceId(int device_id = 0); /// - /// \brief Turn on NPU. - /// - /// \param device_id device_id the NPU card to use (default is 0). - /// - void EnableNpu(int device_id = 0); - /// /// \brief Turn on CustomDevice. /// /// \param device_type device_type the custom device to use. diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index dd0979274f75d..9b19874e0b907 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -171,11 +171,6 @@ void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, enable_multi_stream); } -void PD_ConfigEnableNpu(__pd_keep PD_Config* pd_config, int32_t device_id) { - CHECK_AND_CONVERT_PD_CONFIG; - config->EnableNpu(device_id); -} - PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { CHECK_AND_CONVERT_PD_CONFIG; return config->use_xpu(); diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index 19e1a1c139d4c..a2e050f9f7306 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -214,14 +214,6 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( PD_Bool adaptive_seqlen, PD_Bool enable_multi_stream); /// -/// \brief Turn on NPU. -/// -/// \param[in] pd_onfig config -/// \param[in] device_id device_id the NPU card to use. -/// -PADDLE_CAPI_EXPORT extern void PD_ConfigEnableNpu( - __pd_keep PD_Config* pd_config, int32_t device_id); -/// /// \brief A boolean state telling whether the XPU is turned on. 
/// /// \param[in] pd_onfig config diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index 72c5ab078c83d..c24b941e33e3c 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -212,15 +212,6 @@ func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune boo cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen), cvtGoBoolToPD(enableMultiStream)) } -/// -/// \brief Turn on NPU. -/// -/// \param deviceId the NPU card to use. -/// -func (config *Config) EnableNpu(deviceId int32) { - C.PD_ConfigEnableNpu(config.c, C.int32_t(deviceId)) -} - /// /// \brief A boolean state telling whether the GPU is turned on. /// diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index f7c57fa2b02d6..4dc408241f476 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -50,11 +50,6 @@ if(UNIX AND NOT APPLE) list(APPEND ALLOCATOR_DEPS rt) endif() -if(WITH_ASCEND_CL) - list(APPEND ALLOCATOR_SRCS npu_allocator.cc npu_pinned_allocator.cc) - list(APPEND ALLOCATOR_DEPS npu_info) -endif() - if(WITH_CUSTOM_DEVICE) list(APPEND ALLOCATOR_SRCS custom_allocator.cc) endif() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 029288f153923..42b331298ffa0 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -54,10 +54,6 @@ #include "paddle/fluid/platform/device/xpu/xpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif - #ifdef PADDLE_WITH_IPU #include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif @@ -198,12 +194,6 @@ class AllocatorFacadePrivate { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { - InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); - } - InitNaiveBestFitNPUPinnedAllocator(); -#endif #ifdef PADDLE_WITH_MLU for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); @@ -254,12 +244,6 @@ class AllocatorFacadePrivate { InitNaiveBestFitCUDAPinnedAllocator(); #endif -#ifdef PADDLE_WITH_ASCEND_CL - for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { - InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); - } - InitNaiveBestFitNPUPinnedAllocator(); -#endif #ifdef PADDLE_WITH_XPU for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); @@ -823,17 +807,6 @@ class AllocatorFacadePrivate { } #endif -#ifdef PADDLE_WITH_ASCEND_CL - void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { - allocators_[p] = std::make_shared(p); - } - - void InitNaiveBestFitNPUPinnedAllocator() { - allocators_[platform::NPUPinnedPlace()] = - std::make_shared(); - } -#endif - #ifdef PADDLE_WITH_CUSTOM_DEVICE void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) { allocators_[p] = std::make_shared(p); @@ -915,12 +888,6 @@ class AllocatorFacadePrivate { places.emplace_back(platform::XPUPlace(dev_id)); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - int device_count = platform::GetNPUDeviceCount(); - for (int dev_id = 0; dev_id < device_count; ++dev_id) { - places.emplace_back(platform::NPUPlace(dev_id)); - } 
-#endif #ifdef PADDLE_WITH_IPU int device_count = platform::GetIPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { @@ -1107,7 +1074,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } else { return m->GetAllocator(p, size)->Allocate(size); } -#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) +#elif defined(PADDLE_WITH_XPU) return GetAllocator(place)->Allocate(size); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 3ed758219783c..7f10b2286b4e7 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -16,9 +16,6 @@ #include #include "paddle/fluid/memory/allocation/allocator.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -29,10 +26,6 @@ namespace paddle { namespace memory { namespace allocation { -#ifdef PADDLE_WITH_ASCEND_CL -using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; -#endif - // Allocator Facade is the interface exposed to other modules. // All the configuration or dirty code under development should // be hidden behind this facade. diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 907fd37e44205..9a43da132086c 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -19,8 +19,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "glog/logging.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #define USE_DEVICE DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif @@ -57,9 +56,6 @@ BuddyAllocator::BuddyAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) init_allocate_size_func_ = &platform::GpuInitAllocSize; re_allocate_size_func_ = &platform::GpuReallocSize; -#elif defined(PADDLE_WITH_ASCEND_CL) - init_allocate_size_func_ = &platform::NPUInitAllocSize; - re_allocate_size_func_ = &platform::NPUReallocSize; #elif defined(PADDLE_WITH_MLU) init_allocate_size_func_ = &platform::MLUInitAllocSize; re_allocate_size_func_ = &platform::MLUReallocSize; @@ -257,9 +253,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allocate_bytes = DeviceAllocateSize( &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes); -#elif defined(PADDLE_WITH_ASCEND_CL) - allocate_bytes = DeviceAllocateSize( - &platform::NPUInitAllocSize, &platform::NPUReallocSize, request_bytes); #elif defined(PADDLE_WITH_MLU) allocate_bytes = DeviceAllocateSize( &platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); diff --git a/paddle/fluid/memory/allocation/buddy_allocator_test.cc b/paddle/fluid/memory/allocation/buddy_allocator_test.cc index 315d6649c5d77..e69e773a15f67 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator_test.cc @@ -29,8 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -396,34 +395,6 @@ TEST(BuddyAllocator, Release) { } #endif -#ifdef PADDLE_WITH_ASCEND_CL -TEST(BuddyAllocator, NpuFraction) { - // In a 16 GB machine, the pool size will be about 160 MB - FLAGS_fraction_of_gpu_memory_to_use = 0.92; - FLAGS_initial_gpu_memory_in_mb = 0; - FLAGS_reallocate_gpu_memory_in_mb = 0; - - BuddyAllocator buddy_allocator( - std::unique_ptr(new NPUAllocator(0)), - platform::NPUMinChunkSize(), - platform::NPUMaxChunkSize()); - - // Less than pool size - TestBuddyAllocator(&buddy_allocator, 10); - TestBuddyAllocator(&buddy_allocator, 10 << 10); - TestBuddyAllocator(&buddy_allocator, 10 << 20); - buddy_allocator.Release(); - - // Greater than max chunk size - TestBuddyAllocator(&buddy_allocator, - 300 << 20, - /* use_system_allocator = */ true); - TestBuddyAllocator(&buddy_allocator, - 1 * static_cast(1 << 30), - /* use_system_allocator = */ true); -} -#endif - #ifdef PADDLE_WITH_MLU TEST(BuddyAllocator, MluFraction) { // In a 16 GB machine, the pool size will be about 160 MB diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 4bcfdb1aaf424..a6c8d2f3bdb03 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -213,210 +213,6 @@ size_t Used(const platform::XPUPlace &place) { #endif } -// For Ascend NPU -#ifdef PADDLE_WITH_ASCEND_CL -constexpr int EXTRA_PADDING_SIZE = 32; -class NPUBuddyAllocatorList { - private: - NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) { - auto npu_num = devices_.size(); - allocators_.resize(npu_num); - init_flags_.reserve(npu_num); - for (size_t i = 0; i < npu_num; ++i) { - init_flags_.emplace_back(new std::once_flag()); - } - } - - static NPUBuddyAllocatorList *CreateNewInstance() { - return new NPUBuddyAllocatorList(); - } - - public: - static NPUBuddyAllocatorList *Instance() { - static auto *instance = CreateNewInstance(); - return instance; - } - - BuddyAllocator *Get(int npu_id) { - auto pos = std::distance( - devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); - PADDLE_ENFORCE_LT(pos, - devices_.size(), - platform::errors::OutOfRange( - "The index exceeds the size of devices, the size of " - "devices is %d, the index is %d", - devices_.size(), - pos)); - - std::call_once(*init_flags_[pos], [this, pos] { - platform::SetNPUDeviceId(devices_[pos]); - allocators_[pos].reset( - new BuddyAllocator(std::unique_ptr( - new detail::NPUAllocator(devices_[pos])), - platform::NPUMinChunkSize(), - platform::NPUMaxChunkSize(), - EXTRA_PADDING_SIZE)); - VLOG(10) << "\n\nNOTE:\n" - << "You can set GFlags environment variable " - << "'FLAGS_fraction_of_gpu_memory_to_use' " - << "or 'FLAGS_initial_gpu_memory_in_mb' " - << "or 'FLAGS_reallocate_gpu_memory_in_mb' " - << "to change the memory size for GPU usage.\n" - << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " - << FLAGS_fraction_of_gpu_memory_to_use - << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " - << FLAGS_initial_gpu_memory_in_mb - << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " - << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; - }); - - return allocators_[pos].get(); - } - - private: - std::vector devices_; - std::vector> init_flags_; - std::vector> allocators_; -}; - -BuddyAllocator *GetNPUBuddyAllocator(int npu_id) { - return NPUBuddyAllocatorList::Instance()->Get(npu_id); -} - -BuddyAllocator *GetNPUPinnedBuddyAllocator() { - static std::once_flag init_flag; - static BuddyAllocator *ba = nullptr; - - std::call_once(init_flag, []() { - ba = new BuddyAllocator(std::unique_ptr( - new detail::NPUPinnedAllocator), - phi::backends::cpu::NPUPinnedMinChunkSize(), - phi::backends::cpu::NPUPinnedMaxChunkSize()); - }); - - return ba; -} - -#endif - -template <> -size_t Used(const platform::NPUPlace &place) { -#ifdef PADDLE_WITH_ASCEND_CL - return GetNPUBuddyAllocator(place.device)->Used(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPlace' is not supported in CPU only device.")); -#endif -} - -template <> -void *Alloc(const platform::NPUPlace &place, size_t size) { -#ifdef PADDLE_WITH_ASCEND_CL - auto *buddy_allocator = GetNPUBuddyAllocator(place.device); - auto *ptr = buddy_allocator->Alloc(size); - if (ptr == nullptr) { - platform::NPUDeviceGuard(place.device); - size_t avail, total; - platform::NPUMemoryUsage(&avail, &total); - PADDLE_THROW(platform::errors::ResourceExhausted( - "Cannot allocate %s in NPU %d, available %s, total %s, NpuMinChunkSize " - "%s, NpuMaxChunkSize %s, NPU memory used: %s.", - string::HumanReadableSize(size), - place.device, - string::HumanReadableSize(avail), - string::HumanReadableSize(total), - string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), - string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), - string::HumanReadableSize(Used(place)))); - } else { - if (FLAGS_init_allocated_mem) { - platform::NPUMemsetSync(ptr, 0xEF, size, size); - } - } - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); - return ptr; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPlace' is not supported in CPU only device.")); -#endif -} - -template <> -void Free(const platform::NPUPlace &place, - void *p, - size_t size) { -#ifdef PADDLE_WITH_ASCEND_CL - VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - GetNPUBuddyAllocator(place.device)->Free(p); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPlace' is not supported in CPU only device.")); -#endif -} - -template <> -uint64_t Release(const platform::NPUPlace &place) { -#ifdef PADDLE_WITH_ASCEND_CL - return GetNPUBuddyAllocator(place.device)->Release(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPlace' is not supported in CPU only device.")); -#endif -} - -template <> -size_t Used(const platform::NPUPinnedPlace &place) { -#ifdef PADDLE_WITH_ASCEND_CL - return GetNPUPinnedBuddyAllocator()->Used(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPinnedPlace' is not supported in CPU only device.")); -#endif -} - -template <> -void *Alloc(const platform::NPUPinnedPlace &place, - size_t size) { -#ifdef PADDLE_WITH_ASCEND_CL - auto *buddy_allocator = GetNPUPinnedBuddyAllocator(); - void *ptr = buddy_allocator->Alloc(size); - - if (ptr == nullptr) { - LOG(WARNING) << "Cannot allocate " << size << " bytes in NPUPinnedPlace"; - } - if (FLAGS_init_allocated_mem) { - memset(ptr, 0xEF, size); - } - return ptr; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPinnedPlace' is not supported in CPU
only device.")); -#endif -} - -template <> -void Free(const platform::NPUPinnedPlace &place, - void *p, - size_t size) { -#ifdef PADDLE_WITH_ASCEND_CL - GetNPUPinnedBuddyAllocator()->Free(p); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPinnedPlace' is not supported in CPU only device.")); -#endif -} - -template <> -uint64_t Release( - const platform::NPUPinnedPlace &place) { -#ifdef PADDLE_WITH_ASCEND_CL - return GetNPUPinnedBuddyAllocator()->Release(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "'NPUPinnedPlace' is not supported in CPU only device.")); -#endif -} - // For CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUBuddyAllocatorList { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 7d5cb5200a6a4..6f4f901d986fd 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -61,22 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { } #endif -#ifdef PADDLE_WITH_ASCEND_CL -TEST(NaiveBestFitAllocatorTest, NpuAlloc) { - NaiveBestFitAllocator alloc{platform::NPUPlace(0)}; - { - size_t size = (1 << 20); - auto allocation = alloc.Allocate(size); - } - sleep(10); - alloc.Release(platform::NPUPlace(0)); - - size_t size = (1 << 20); - auto allocation = alloc.Allocate(size); - alloc.Release(platform::NPUPlace(0)); -} -#endif - #ifdef PADDLE_WITH_MLU TEST(NaiveBestFitAllocatorTest, MluAlloc) { NaiveBestFitAllocator alloc{platform::MLUPlace(0)}; diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc deleted file mode 100644 index a4f253ba657e9..0000000000000 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/memory/allocation/npu_allocator.h" - -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace memory { -namespace allocation { - -bool NPUAllocator::IsAllocThreadSafe() const { return true; } -void NPUAllocator::FreeImpl(phi::Allocation* allocation) { - PADDLE_ENFORCE_EQ( - allocation->place(), - place_, - platform::errors::PermissionDenied( - "NPU memory is freed in incorrect device. 
This may be a bug")); - platform::RecordedNPUFree( - allocation->ptr(), allocation->size(), place_.device); - delete allocation; -} - -phi::Allocation* NPUAllocator::AllocateImpl(size_t size) { - std::call_once(once_flag_, - [this] { platform::SetNPUDeviceId(place_.device); }); - - void* ptr; - auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device); - if (LIKELY(result == ACL_ERROR_NONE)) { - return new Allocation(ptr, size, platform::Place(place_)); - } - - size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedNPUMemGetInfo( - &avail, &total, &actual_avail, &actual_total, place_.device); - - std::string err_msg; - if (is_limited) { - auto limit_size = (total >> 20); - err_msg = string::Sprintf( - "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger " - "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum " - "GPU memory usage is limited to %d MB.\n" - " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, - limit_size); - } - - PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( - "\n\nOut of memory error on NPU %d. " - "Cannot allocate %s memory on NPU %d, " - "available memory is only %s.\n\n" - "Please check whether there is any other process using NPU %d.\n" - "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" - "2. If no, please decrease the batch size of your model. %s\n\n", - place_.device, - string::HumanReadableSize(size), - place_.device, - string::HumanReadableSize(avail), - place_.device, - err_msg)); -} - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h deleted file mode 100644 index 04832c6fd9b63..0000000000000 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include // NOLINT - -#include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace memory { -namespace allocation { - -class NPUAllocator : public Allocator { - public: - explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {} - - bool IsAllocThreadSafe() const override; - - protected: - void FreeImpl(phi::Allocation* allocation) override; - phi::Allocation* AllocateImpl(size_t size) override; - - private: - platform::NPUPlace place_; - std::once_flag once_flag_; -}; - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc deleted file mode 100644 index db76cbaace4c3..0000000000000 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-void NPUPinnedAllocator::ProcessEventsAndFree() {
-  for (auto it = npu_events_.begin(); it != npu_events_.end();) {
-    aclrtEvent event = it->second;
-    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
-    platform::NPUEventQuery(event, &status);
-
-    if (status == ACL_EVENT_STATUS_COMPLETE) {
-      auto *allocation = it->first;
-      void *ptr = allocation->ptr();
-      free(ptr);
-      npu_events_.erase(it++);
-      delete allocation;
-      platform::NPUEventDestroy(event);
-    } else {
-      ++it;
-    }
-  }
-}
-
-phi::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  ProcessEventsAndFree();
-  void *ptr;
-  int error = posix_memalign(&ptr, kAlignment, size);
-  PADDLE_ENFORCE_EQ(
-      error,
-      0,
-      platform::errors::ResourceExhausted(
-          "Failed to allocate %ld bytes, error code is %d.", size, error));
-  return new Allocation(ptr, size, platform::NPUPinnedPlace());
-}
-
-void NPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  void *ptr = allocation->ptr();
-  auto iter = npu_events_.find(allocation);
-
-  // Managed by GC if RecordEvent was never called on this allocation.
-  if (iter == npu_events_.end()) {
-    // Double free? No such problem has been found so far.
-    // Otherwise we may need a set that records which
-    // Allocations are managed by GC.
-    free(ptr);
-    delete allocation;
-    return;
-  }
-
-  aclrtEvent event = iter->second;
-  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
-  platform::NPUEventQuery(event, &status);
-  if (status == ACL_EVENT_STATUS_COMPLETE) {
-    free(ptr);
-    npu_events_.erase(allocation);
-    delete allocation;
-    platform::NPUEventDestroy(event);
-  }
-  return;
-}
-
-uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  // Empty implementation
-  return static_cast<uint64_t>(0);
-}
-
-void NPUPinnedAllocator::RecordEvent(phi::Allocation *allocation,
-                                     aclrtStream stream) {
-  std::lock_guard<std::mutex> lock(mtx_);
-  aclrtEvent event = nullptr;
-  platform::NPUEventCreate(&event);
-  platform::NPUEventRecord(event, stream);
-  npu_events_.insert({allocation, event});
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
-#endif
diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h
deleted file mode 100644
index 80d545e507ec3..0000000000000
--- a/paddle/fluid/memory/allocation/npu_pinned_allocator.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-
-#include "acl/acl.h"
-#include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class NPUPinnedAllocator : public Allocator {
- public:
-  bool IsAllocThreadSafe() const override { return true; }
-  void ProcessEventsAndFree();
-  void RecordEvent(phi::Allocation *allocation, aclrtStream stream);
-  constexpr static size_t kAlignment = 4096UL;
-
- protected:
-  phi::Allocation *AllocateImpl(size_t size) override;
-  void FreeImpl(phi::Allocation *allocation) override;
-  uint64_t ReleaseImpl(const platform::Place &place) override;
-
- private:
-  std::unordered_map<phi::Allocation *, aclrtEvent> npu_events_;
-  mutable std::mutex mtx_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
-
-#endif
diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc
index 6818c3c0f5593..ddd916817e756 100644
--- a/paddle/fluid/memory/allocation/system_allocator.cc
+++ b/paddle/fluid/memory/allocation/system_allocator.cc
@@ -287,135 +287,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-void* NPUAllocator::Alloc(size_t* index, size_t size) {
-  if (size <= 0) return nullptr;
-
-  void* p;
-  auto result = platform::RecordedNPUMalloc(&p, size, npu_id_);
-
-  if (result == ACL_ERROR_NONE) {
-    *index = 0;
-    npu_alloc_size_ += size;
-    return p;
-  } else {
-    size_t avail, total, actual_avail, actual_total;
-    bool is_limited = platform::RecordedNPUMemGetInfo(
-        &avail, &total, &actual_avail, &actual_total, npu_id_);
-
-    std::string err_msg;
-    if (is_limited) {
-      auto limit_size = (total >> 20);
-      err_msg = string::Sprintf(
-          "\n   3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
-          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
-          "maximum GPU memory usage is limited to %d MB.\n"
-          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-          limit_size,
-          limit_size);
-    }
-
-    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
-        "\n\nOut of memory error on NPU %d. "
-        "Cannot allocate %s memory on NPU %d, "
-        "available memory is only %s.\n\n"
-        "Please check whether there is any other process using NPU %d.\n"
-        "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n"
-        "2. If no, please try one of the following suggestions:\n"
-        "   1) Decrease the batch size of your model.\n"
-        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
-        "please set it to a higher value but less than 1.0.\n"
-        "      The command is "
-        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-        npu_id_,
-        string::HumanReadableSize(size),
-        npu_id_,
-        string::HumanReadableSize(avail),
-        npu_id_,
-        FLAGS_fraction_of_gpu_memory_to_use,
-        err_msg));
-  }
-}
-
-void NPUAllocator::Free(void* p, size_t size, size_t index) {
-  VLOG(4) << "Free " << p << " size " << size;
-  PADDLE_ENFORCE_EQ(index,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(npu_alloc_size_,
-                    size,
-                    platform::errors::InvalidArgument(
-                        "The size of memory (%d) to free exceeds the size of "
-                        "allocated npu memory (%d)",
-                        size,
-                        npu_alloc_size_));
-  npu_alloc_size_ -= size;
-
-  platform::RecordedNPUFree(p, size, npu_id_);
-}
-
-bool NPUAllocator::UseGpu() const { return true; }
-
-void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
-  if (size <= 0) return nullptr;
-
-  size_t usable =
-      phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
-
-  if (size > usable) {
-    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
-                 << " MB pinned memory."
-                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
-    return nullptr;
-  }
-
-  void* p;
-  // PINNED memory is visible to all NPU contexts.
-  auto result = platform::NPUHostMalloc(&p, size);
-
-  if (result == ACL_ERROR_NONE) {
-    *index = 1;  // PINNED memory
-    npu_pinnd_alloc_size_ += size;
-    return p;
-  } else {
-    LOG(WARNING) << "NPUHostMalloc failed.";
-    return nullptr;
-  }
-
-  return nullptr;
-}
-
-void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
-  aclError err;
-  PADDLE_ENFORCE_EQ(index,
-                    1,
-                    platform::errors::InvalidArgument(
-                        "The index should be 1, but got %d", index));
-
-  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
-                    size,
-                    platform::errors::InvalidArgument(
-                        "The size of memory (%d) to free exceeds the size of "
-                        "allocated npu pinned memory (%d)",
-                        size,
-                        npu_pinnd_alloc_size_));
-  npu_pinnd_alloc_size_ -= size;
-  err = platform::NPUHostFree(p);
-
-  if (err != ACL_ERROR_NONE) {
-    PADDLE_ENFORCE_EQ(
-        err,
-        0,
-        platform::errors::Fatal(
-            "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
-  }
-}
-
-bool NPUPinnedAllocator::UseGpu() const { return false; }
-
-#endif
-
 #ifdef PADDLE_WITH_MLU
 void* MLUAllocator::Alloc(size_t* index, size_t size) {
   if (size <= 0) return nullptr;
diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h
index 18c2e278f99c5..bb1a4ee998174 100644
--- a/paddle/fluid/memory/allocation/system_allocator.h
+++ b/paddle/fluid/memory/allocation/system_allocator.h
@@ -68,32 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
 };
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-
-class NPUAllocator : public SystemAllocator {
- public:
-  explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {}
-
-  virtual void* Alloc(size_t* index, size_t size);
-  virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-
- private:
-  size_t npu_alloc_size_ = 0;
-  int npu_id_;
-};
-
-class NPUPinnedAllocator : public SystemAllocator {
- public:
-  virtual void* Alloc(size_t* index, size_t size);
-  virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-
- private:
-  size_t npu_pinnd_alloc_size_ = 0;
-};
-#endif
-
 #ifdef PADDLE_WITH_MLU
 class MLUAllocator : public SystemAllocator {
  public:
diff --git a/paddle/fluid/memory/allocation/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc
index 4749ff3f8adb7..d20e3a1d6c9d0 100644
--- a/paddle/fluid/memory/allocation/system_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/system_allocator_test.cc
@@ -83,14 +83,6 @@ TEST(GPUAllocator, AllocFailure) {
 }
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-TEST(NPUAllocator, Alloc) {
-  paddle::memory::detail::NPUAllocator a(0);
-  TestAllocator(&a, 1 << 20);
-  TestAllocator(&a, 1);
-}
-#endif
-
 #ifdef PADDLE_WITH_MLU
 TEST(MLUAllocator, Alloc) {
   paddle::memory::detail::MLUAllocator a(0);
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index f3fb1fdf5ab55..0c5a0fef7172a 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -260,415 +260,6 @@ void Copy(phi::Place dst_place,
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-template <>
-void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::CPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetNPUDeviceId(dst_place.device);
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by stream(" << stream << ")";
-
-  if (stream) {
-    platform::RecordEvent record_event(
-        "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpyAsync(dst,
-                             src,
-                             num,
-                             ACL_MEMCPY_HOST_TO_DEVICE,
-                             reinterpret_cast<aclrtStream>(stream));
-  } else {
-    // On NPU, an async operation after a sync operation is OK, while a sync
-    // operation after an async one is not, since the async operation may not
-    // have finished yet. So a wait is needed before any sync operation.
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
-
-    platform::RecordEvent record_event(
-        "NpuMemcpySync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
-  }
-}
-
-template <>
-void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::NPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetNPUDeviceId(src_place.device);
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by stream(" << stream << ")";
-
-  if (stream) {
-    platform::RecordEvent record_event(
-        "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpyAsync(dst,
-                             src,
-                             num,
-                             ACL_MEMCPY_DEVICE_TO_HOST,
-                             reinterpret_cast<aclrtStream>(stream));
-  } else {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
-
-    platform::RecordEvent record_event(
-        "NpuMemcpySync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
-  }
-}
-
-template <>
-void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::NPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-          << dst_place << " by stream(" << stream << ")";
-  if (dst_place == src_place) {
-    platform::SetNPUDeviceId(src_place.device);
-    if (stream) {
-      platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU",
-                                         platform::TracerEventType::UserDefined,
-                                         1);
-      
platform::NPUMemcpyAsync(dst, - src, - num, - ACL_MEMCPY_DEVICE_TO_DEVICE, - reinterpret_cast(stream)); - } else { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - static_cast(pool.Get(dst_place))->Wait(); - - platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); - } - } else { - if (!platform::NPUCanAccessPeer(dst_place.device, dst_place.device)) { - PADDLE_THROW(platform::errors::Unavailable( - "Peer access between NPU places is not allowed.")); - } - if (stream) { - // TODO(zhiqiu): support peer access? - platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpyAsync(dst, - src, - num, - ACL_MEMCPY_DEVICE_TO_DEVICE, - reinterpret_cast(stream)); - } else { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - static_cast(pool.Get(dst_place))->Wait(); - - platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); - } - } -} - -template <> -void Copy( - platform::CPUPlace dst_place, - void* dst, - platform::NPUPinnedPlace src_place, - const void* src, - size_t num) { - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place; - if (UNLIKELY(num == 0)) return; - std::memcpy(dst, src, num); -} - -template <> -void Copy( - platform::NPUPinnedPlace dst_place, - void* dst, - platform::CPUPlace src_place, - const void* src, - size_t num) { - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place; - if (UNLIKELY(num == 0)) return; - std::memcpy(dst, src, num); -} - -template <> -void Copy( - platform::NPUPinnedPlace dst_place, - void* dst, - platform::NPUPinnedPlace src_place, - const void* src, - size_t num) { - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place; - if (UNLIKELY(num == 0)) return; - std::memcpy(dst, src, num); -} - -template <> -void Copy( - platform::NPUPinnedPlace dst_place, - void* dst, - platform::NPUPlace src_place, - const void* src, - size_t num, - void* stream) { - if (UNLIKELY(num == 0)) return; - - platform::SetNPUDeviceId(src_place.device); - - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; - - if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpyAsync(dst, - src, - num, - ACL_MEMCPY_DEVICE_TO_HOST, - reinterpret_cast(stream)); - } else { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - static_cast(pool.Get(src_place))->Wait(); - - platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); - } -} - -template <> -void Copy( - platform::NPUPlace dst_place, - void* dst, - platform::NPUPinnedPlace src_place, - const void* src, - size_t num, - void* stream) { - if (UNLIKELY(num == 0)) return; - - platform::SetNPUDeviceId(dst_place.device); - - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; - - if (stream) { - platform::RecordEvent 
record_event("NpuMemcpyAsync:NPUPinned->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpyAsync(dst, - src, - num, - ACL_MEMCPY_HOST_TO_DEVICE, - reinterpret_cast(stream)); - } else { - // On NPU, async operation after sync operation is ok, while sync operation - // after async is not ok, since the async operation may not done. - // So, its needed to do wait before sync operation. - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - static_cast(pool.Get(dst_place))->Wait(); - - platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU", - platform::TracerEventType::UserDefined, - 1); - platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); - } -} - -// NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace. -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - aclrtStream stream) { - if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::CPU) { - platform::CPUPlace place_dst, place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::NPU) { - platform::NPUPlace place_dst(dst_place.GetDeviceId()); - platform::CPUPlace place_src; - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::NPU && - dst_place.GetType() == phi::AllocationType::CPU) { - platform::NPUPlace place_src(src_place.GetDeviceId()); - platform::CPUPlace place_dst; - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::NPU && - dst_place.GetType() == phi::AllocationType::NPU) { - platform::NPUPlace place_src(src_place.GetDeviceId()); - platform::NPUPlace place_dst(dst_place.GetDeviceId()); - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - platform::CPUPlace place_src; - platform::NPUPinnedPlace place_dst; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::CPU) { - platform::CPUPlace place_dst; - platform::NPUPinnedPlace place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - platform::NPUPinnedPlace place_dst; - platform::NPUPinnedPlace place_src; - return Copy(place_dst, dst, place_src, src, num); - } else if (src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::NPU) { - platform::NPUPinnedPlace place_src; - platform::NPUPlace place_dst(dst_place.GetDeviceId()); - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::NPU && - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - platform::NPUPinnedPlace place_dst; - platform::NPUPlace place_src(src_place.GetDeviceId()); - return Copy(place_dst, dst, place_src, src, num, stream); -#ifdef PADDLE_WITH_CUSTOM_DEVICE - } else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT - dst_place.GetType() == phi::AllocationType::CUSTOM) { - platform::CPUPlace place_src; - platform::CustomPlace place_dst(dst_place); - return Copy(place_dst, dst, place_src, src, num, 
stream); - } else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT - dst_place.GetType() == phi::AllocationType::CPU) { - platform::CustomPlace place_src(src_place); - platform::CPUPlace place_dst; - return Copy(place_dst, dst, place_src, src, num, stream); - } else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT - dst_place.GetType() == phi::AllocationType::CUSTOM) { - platform::CustomPlace place_src(src_place); - platform::CustomPlace place_dst(dst_place); - return Copy(place_dst, dst, place_src, src, num, stream); -#endif - } -} - -// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace). -template <> -void Copy(phi::CPUPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); -} - -// NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace). -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::CPUPlace src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); -} - -// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace) -template <> -void Copy(phi::NPUPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), - dst, - src_place, - src, - num, - stream); -} - -// NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::NPUPlace src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(dst_place, - dst, - phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, - num, - stream); -} - -// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace) -template <> -void Copy(phi::NPUPinnedPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); -} - -// NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::NPUPinnedPlace src_place, - const void* src, - size_t num, - aclrtStream stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); -} - -// NOTE: only for (CPUPlace) -> (NPUPinnedPlace) -template <> -void Copy(phi::NPUPinnedPlace dst_place, - void* dst, - phi::Place src_place, - const void* src, - size_t num) { - Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); -} - -// NOTE: only for (NPUPinnedPlace) -> (CPUPlace) -template <> -void Copy(phi::Place dst_place, - void* dst, - phi::NPUPinnedPlace src_place, - const void* src, - size_t num) { - Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); -} -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K @@ -1391,18 +982,6 @@ void Copy(phi::Place dst_place, std::memcpy(dst, src, num); } #endif -#ifdef PADDLE_WITH_ASCEND_CL - else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - std::memcpy(dst, src, num); - } else if (src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::CPU) { - std::memcpy(dst, src, num); - } else if 
(src_place.GetType() == phi::AllocationType::NPUPINNED && - dst_place.GetType() == phi::AllocationType::NPUPINNED) { - std::memcpy(dst, src, num); - } -#endif #ifdef PADDLE_WITH_XPU else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::CPU) { @@ -1488,8 +1067,7 @@ void Copy(phi::CPUPlace dst_place, } #if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_HIP) && \ - !defined(PADDLE_WITH_MLU) + !defined(PADDLE_WITH_HIP) template <> void Copy(phi::Place dst_place, diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 6884ded10cd84..3899468297cf6 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -21,8 +21,7 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/phi/backends/device_memory_aligment.h" #include "paddle/phi/kernels/funcs/math_function.h" -#ifdef PADDLE_WITH_ASCEND_CL -#endif + #include "paddle/fluid/framework/convert_utils.h" #ifdef PADDLE_WITH_MLU #include "paddle/fluid/operators/mlu/mlu_baseop.h" diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index d0b20a2f08066..f6f7eb31cb8e6 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -148,16 +148,4 @@ TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { ctx.PartialInitWithAllocator(); Compare2(&scope, ctx, "copy_cross_scope"); } -#elif PADDLE_WITH_ASCEND_CL -TEST(copy_cross_scope, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare1(&scope, ctx, "copy_cross_scope"); -} - -TEST(copy_cross_scope_to_main_scope, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare2(&scope, ctx, "copy_cross_scope"); -} #endif diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 639d376485b4b..37b00eda81822 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -28,15 +28,9 @@ function(detection_library TARGET_NAME) PARENT_SCOPE) endfunction() -if(WITH_ASCEND_CL) - detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op_npu.cc) - detection_library(density_prior_box_op SRCS density_prior_box_op.cc - density_prior_box_op.cu density_prior_box_op_npu.cc) -else() - detection_library(box_coder_op SRCS box_coder_op.cc) - detection_library(density_prior_box_op SRCS density_prior_box_op.cc - density_prior_box_op.cu) -endif() +detection_library(box_coder_op SRCS box_coder_op.cc) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc + density_prior_box_op.cu) if(WITH_XPU) detection_library(iou_similarity_op SRCS iou_similarity_op.cc @@ -49,11 +43,6 @@ elseif(WITH_MLU) iou_similarity_op_mlu.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc) detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc) -elseif(WITH_ASCEND_CL) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc - iou_similarity_op_npu.cc) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc) - detection_library(yolo_box_op SRCS yolo_box_op.cc) else() detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 
6d6739eed6702..8ff69a537ff7f 100644
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -36,13 +36,6 @@ inline std::vector<int> get_expand_times(
         *expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
     expand_data = cpu_expand_tensor.data<int>();
   }
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(expand_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *expand_tensor, platform::CPUPlace(), &cpu_expand_tensor);
-    expand_data = cpu_expand_tensor.data<int>();
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (platform::is_xpu_place(expand_tensor->place())) {
     paddle::framework::TensorCopySync(
diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h
index 4343d42c2ccfc..0770dba0a44ad 100644
--- a/paddle/fluid/operators/expand_v2_op.h
+++ b/paddle/fluid/operators/expand_v2_op.h
@@ -37,13 +37,6 @@ inline std::vector<int> get_expand_shape(
         *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
     shape_data = cpu_shape_tensor.data<int>();
   }
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(shape_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
-    shape_data = cpu_shape_tensor.data<int>();
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (platform::is_xpu_place(shape_tensor->place())) {
     paddle::framework::TensorCopySync(
@@ -75,13 +68,6 @@ inline std::vector<int> get_expand_shape(
       paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
       vec_epxand_shape.push_back(*temp.data<int32_t>());
     }
-#ifdef PADDLE_WITH_ASCEND_CL
-    else if (platform::is_npu_place(tensor->place())) {  // NOLINT
-      phi::DenseTensor temp;
-      paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-      vec_epxand_shape.push_back(*temp.data<int32_t>());
-    }
-#endif
 #ifdef PADDLE_WITH_XPU
     else if (platform::is_xpu_place(tensor->place())) {  // NOLINT
       phi::DenseTensor temp;
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 7fbdbfd6d41fe..61cc7dc9f4b64 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,10 +1,3 @@
-if(WITH_ASCEND_CL)
-  cc_library(
-    beam_search_npu
-    SRCS beam_search_npu.cc
-    DEPS npu_op_runner)
-endif()
-
 if(WITH_XPU)
   cc_library(
     beam_search_xpu
@@ -13,9 +6,7 @@ if(WITH_XPU)
 endif()
 
 # please add new math_library in alphabetical order
-if(WITH_ASCEND_CL)
-  math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner)
-elseif(WITH_MLU)
+if(WITH_MLU)
   math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
 else()
   math_library(concat_and_split DEPS concat_and_split_functor)
diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc
index 06af45d48506a..60d7a6ee14ba7 100644
--- a/paddle/fluid/operators/memcpy_d2h_op.cc
+++ b/paddle/fluid/operators/memcpy_d2h_op.cc
@@ -122,34 +122,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     MemcpyD2HInferShapeFunctor);
 
-#ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_d2h,
-                               float,
-                               ops::MemcpyD2HKernel,
-                               double,
-                               ops::MemcpyD2HKernel,
-                               int8_t,
-                               ops::MemcpyD2HKernel,
-                               uint8_t,
-                               ops::MemcpyD2HKernel,
-                               int,
-                               ops::MemcpyD2HKernel,
-                               int64_t,
-                               ops::MemcpyD2HKernel,
-                               bool,
-                               ops::MemcpyD2HKernel,
-                               paddle::platform::bfloat16,
-                               ops::MemcpyD2HKernel,
-                               paddle::platform::complex<float>,
-                               ops::MemcpyD2HKernel,
-                               paddle::platform::complex<double>,
-                               ops::MemcpyD2HKernel,
-                               plat::float16,
-                               ops::MemcpyD2HKernel,
-                               int16_t,
-                               ops::MemcpyD2HKernel);
-#endif
-
 #ifdef 
PADDLE_WITH_IPU REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_d2h, float, diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 9754628b1b8eb..b9debd5e67a26 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -87,11 +87,7 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { op->SetAttrMap(this->Attrs()); op->SetInput("X", this->Input("X")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); -#ifndef PADDLE_WITH_ASCEND_CL op->SetInput("Norm", this->Output("Norm")); -#else - op->SetInput("Out", this->Output("Out")); -#endif op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); } }; diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index c0c05e9e0ba90..8a1d681766fab 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -25,9 +25,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/xpu/xpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#endif - #ifdef PADDLE_WITH_MLU #include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h" diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c2c61f06446b8..5d02136a80375 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -248,31 +248,6 @@ void EmplaceDeviceContexts( PADDLE_THROW( platform::errors::Unimplemented("IPUPlace is not supported. Please " "re-compile with WITH_IPU option.")); -#endif - } else if (platform::is_npu_place(place)) { -#ifdef PADDLE_WITH_ASCEND_CL - EmplaceDeviceContext( - place_to_device_context, - place, - disable_setting_default_stream_for_allocator, - /*unused*/ stream_priority); -#else - PADDLE_THROW(platform::errors::Unimplemented( - "NPUPlace is not supported. Please " - "re-compile with WITH_ASCEND_CL option.")); -#endif - } else if (platform::is_npu_pinned_place(place)) { -#ifdef PADDLE_WITH_ASCEND_CL - EmplaceDeviceContext( - place_to_device_context, - place, - disable_setting_default_stream_for_allocator, - /*unused*/ stream_priority); -#else - PADDLE_THROW(platform::errors::Unimplemented( - "NPUPinnedPlace is not supported. Please re-compile with " - "WITH_ASCEND_CL " - "option.")); #endif } } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 67b6ab8f724cb..3c8ec21adbed8 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -68,8 +68,6 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_ASCEND_CL -#endif #include "paddle/phi/backends/device_ext.h" #include "paddle/phi/backends/stream.h" @@ -89,10 +87,6 @@ struct GpuDevice; #include "paddle/phi/backends/xpu/xpu_context.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "acl/acl.h" -#endif - namespace paddle { namespace platform { @@ -150,86 +144,6 @@ namespace xpu = baidu::xpu::api; using XPUDeviceContext = phi::XPUContext; #endif -#ifdef PADDLE_WITH_ASCEND_CL -class NPUDeviceContext - : public DeviceContext, - public phi::TypeInfoTraits { - public: - explicit NPUDeviceContext(NPUPlace place); - virtual ~NPUDeviceContext(); - Eigen::DefaultDevice* eigen_device() const { return nullptr; } - const Place& GetPlace() const override; - aclrtContext context() const; - - /*! 
\brief  Wait for the completion of all operations in the stream. */
-  void Wait() const override;
-
-  /*! \brief  Return the npu stream in the device context. */
-  aclrtStream stream() const;
-
-  template <typename Callback>
-  void AddStreamCallback(Callback&& callback) const {
-    return stream_->AddCallback(callback);
-  }
-
-  void WaitStreamCallback() const { return stream_->WaitCallback(); }
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-  /*! \brief  Return hccl communicators. */
-  HcclComm hccl_comm() const { return hccl_comm_; }
-
-  /*! \brief  Set hccl communicators. */
-  void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; }
-#endif
-
-  // template <typename Callback>
-  // void AddStreamCallback(Callback&& callback) const {
-  //   return stream_->AddCallback(callback);
-  // }
-
-  // void WaitStreamCallback() const { return stream_->WaitCallback(); }
-
-  static const char* name() { return "NPUDeviceContext"; }
-
- private:
-  NPUPlace place_;
-  aclrtContext context_;
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  // HCCLContext_t hccl_context_;
-  HcclComm hccl_comm_{nullptr};
-#endif
-
-  // Needs to be the same as in other DeviceContexts,
-  // even though eigen_device_ is not used on NPU.
-  // NOTE(zhiqiu): why is this needed?
-  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
-  std::shared_ptr<stream::NPUStream> stream_;
-
-  DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
-};
-
-// Currently, NPUPinnedDeviceContext is only used for data copying.
-class NPUPinnedDeviceContext
-    : public DeviceContext,
-      public phi::TypeInfoTraits<DeviceContext, NPUPinnedDeviceContext> {
- public:
-  NPUPinnedDeviceContext();
-  explicit NPUPinnedDeviceContext(NPUPinnedPlace place);
-
-  const Place& GetPlace() const override;
-
-  Eigen::DefaultDevice* eigen_device() const;
-
-  static const char* name() { return "NPUPinnedDeviceContext"; }
-
- private:
-  NPUPinnedPlace place_;
-  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
-};
-
-#endif
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 using CUDAPinnedDeviceContext = phi::GPUPinnedContext;
 #endif
@@ -264,18 +178,6 @@ template <>
 struct DefaultDeviceContextType<platform::XPUPlace>;
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-template <>
-struct DefaultDeviceContextType<platform::NPUPlace> {
-  using TYPE = paddle::platform::NPUDeviceContext;
-};
-
-template <>
-struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
-  using TYPE = paddle::platform::NPUPinnedDeviceContext;
-};
-#endif
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <>
 struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h
index 8659d8be902b6..dc40fbe186e88 100644
--- a/paddle/fluid/platform/device_event.h
+++ b/paddle/fluid/platform/device_event.h
@@ -38,12 +38,6 @@ USE_EVENT_WAIT(kCUDA, kCUDA)
 USE_EVENT_WAIT(kCPU, kCUDA)
 #endif
 
-#ifdef PADDLE_WITH_ASCEND_CL
-USE_EVENT(kNPU);
-USE_EVENT_WAIT(kNPU, kNPU)
-USE_EVENT_WAIT(kCPU, kNPU)
-#endif
-
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 USE_EVENT(kCUSTOM_DEVICE);
 USE_EVENT_WAIT(kCUSTOM_DEVICE, kCUSTOM_DEVICE)
diff --git a/paddle/fluid/platform/device_event_npu.cc b/paddle/fluid/platform/device_event_npu.cc
deleted file mode 100644
index ba3ea8ffcda38..0000000000000
--- a/paddle/fluid/platform/device_event_npu.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifdef PADDLE_WITH_ASCEND_CL - -#include "paddle/fluid/platform/device_event_base.h" -#include "paddle/fluid/platform/event.h" -namespace paddle { -namespace platform { -struct NPUDeviceEventWrapper { - explicit NPUDeviceEventWrapper(const platform::Place& place) { - PADDLE_ENFORCE_EQ( - platform::is_npu_place(place), - true, - platform::errors::PreconditionNotMet( - "Required device shall be NPUPlace, but received %d. ", place)); - - device_id_ = place.device; - PADDLE_ENFORCE_GT( - device_id_, - -1, - platform::errors::PreconditionNotMet( - "Required DeviceOption.device_id > -1, but received %d. ", - device_id_)); - inner_event_ = NpuEventResourcePool::Instance().New(device_id_); - } - std::shared_ptr inner_event_; - int device_id_; -}; - -void DeviceEventCreateNPU(DeviceEvent* event, - const platform::Place& place, - unsigned int) { - event->InitEvent(std::make_shared(place)); -} - -void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) { - auto* wrapper = static_cast(event->GetEvent().get()); - auto* npu_dev_ctx = dynamic_cast(context); - PADDLE_ENFORCE_NOT_NULL( - npu_dev_ctx, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast context into NPUDeviceContext.")); - NPUEventRecord(wrapper->inner_event_.get(), npu_dev_ctx->stream()); -} - -bool DeviceEventQueryNPU(const DeviceEvent* event) { - auto* wrapper = static_cast(event->GetEvent().get()); - PADDLE_ENFORCE_NOT_NULL( - wrapper, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast event into NPUDeviceEventWrapper.")); - aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; - platform::NPUEventQuery(wrapper->inner_event_.get(), &status); - return ACL_EVENT_STATUS_COMPLETE == status; -} - -void DeviceEventFinishNPU(const DeviceEvent* event) { - auto* wrapper = static_cast(event->GetEvent().get()); - NPUEventSynchronize(wrapper->inner_event_.get()); -} - -void DeviceEventNPUWaitNPU(const DeviceEvent* event, - const DeviceContext* context) { - auto* wrapper = static_cast(event->GetEvent().get()); - auto* npu_dev_ctx = dynamic_cast(context); - PADDLE_ENFORCE_NOT_NULL( - npu_dev_ctx, - platform::errors::PreconditionNotMet( - "Failed to dynamic_cast context into NPUDeviceContext.")); - NPUStreamWaitEvent(npu_dev_ctx->stream(), wrapper->inner_event_.get()); -} - -void DeviceEventCPUWaitNPU(const DeviceEvent* event, - const DeviceContext* context) { - DeviceEventFinishNPU(event); -} - -void DeviceEventSetFinishedNPU(const DeviceEvent* event) { - // do nothing -} - -void EventResetNPU(const DeviceEvent* event) { - // do nothing -} - -} // namespace platform -} // namespace paddle - -using ::paddle::platform::kCPU; -using ::paddle::platform::kNPU; -REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU) -REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU) -REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU) -REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU) -REGISTER_EVENT_SET_FINISHED_FUNCTION( - kNPU, paddle::platform::DeviceEventSetFinishedNPU) -REGISTER_EVENT_WAIT_FUNCTION(kNPU, 
-                             kNPU,
-                             paddle::platform::DeviceEventNPUWaitNPU)
-REGISTER_EVENT_WAIT_FUNCTION(kCPU,
-                             kNPU,
-                             paddle::platform::DeviceEventCPUWaitNPU)
-REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU)
-#endif
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index b64bf81dc0d05..afa689a3f904d 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -54,7 +54,6 @@ void* GetCUDADsoHandle() { return phi::dynload::GetCUDADsoHandle(); }
 void* GetWarpCTCDsoHandle() { return phi::dynload::GetWarpCTCDsoHandle(); }
 
 void* GetNCCLDsoHandle() { return phi::dynload::GetNCCLDsoHandle(); }
-void* GetHCCLDsoHandle() { return phi::dynload::GetHCCLDsoHandle(); }
 
 void* GetTensorRtDsoHandle() { return phi::dynload::GetTensorRtDsoHandle(); }
 
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 50714dfb302eb..10b985e0b2044 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -37,7 +37,6 @@ void* GetNVRTCDsoHandle();
 void* GetCUDADsoHandle();
 void* GetWarpCTCDsoHandle();
 void* GetNCCLDsoHandle();
-void* GetHCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
 void* GetLAPACKDsoHandle();
diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc
index 497a6b3cb98c2..e9bea4d87f369 100644
--- a/paddle/fluid/platform/gen_comm_id_helper.cc
+++ b/paddle/fluid/platform/gen_comm_id_helper.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||          \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
-    defined(PADDLE_WITH_CNCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 
 #include <arpa/inet.h>
diff --git a/paddle/fluid/platform/gen_comm_id_helper.h b/paddle/fluid/platform/gen_comm_id_helper.h
index 5bd81faafcc18..0766e2e91f862 100644
--- a/paddle/fluid/platform/gen_comm_id_helper.h
+++ b/paddle/fluid/platform/gen_comm_id_helper.h
@@ -14,9 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||          \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
-    defined(PADDLE_WITH_CNCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
 #include <functional>
 #include <memory>
 #include <mutex>
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index cb99a60bd6e44..43d7c61668701 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -187,17 +187,6 @@ void InitDevices() {
     LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime.";
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  // NOTE(zhiqiu): use singleton to explicitly init and finalize ACL
-  platform::AclInstance::Instance();  // NOLINT
-  try {
-    // use user specified NPUs in single-node multi-process mode.
-    devices = platform::GetSelectedNPUDevices();
-  } catch (const std::exception &exp) {
-    LOG(WARNING) << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found "
-                    "in runtime.";
-  }
-#endif
 #ifdef PADDLE_WITH_IPU
   try {
     // use user specified IPUs.
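NOTE: the NPUPinnedAllocator deleted earlier in this patch implements an
event-gated deferred free for pinned host memory: FreeImpl() keeps a block
alive while the device event recorded on it is still pending, and
AllocateImpl() first reaps blocks whose events have completed. A minimal
self-contained C++ sketch of that pattern follows; `Event` and its `done()`
query are hypothetical stand-ins for aclrtEvent and platform::NPUEventQuery,
not Paddle or ACL API.

    #include <cstdlib>
    #include <mutex>
    #include <unordered_map>

    // Hypothetical stand-in for aclrtEvent + a non-blocking completion query.
    struct Event {
      bool done() const { return true; }  // a real backend queries the runtime here
    };

    class DeferredFreePool {
     public:
      // Called instead of free(): park the block until its event completes.
      void DeferFree(void* ptr, Event ev) {
        std::lock_guard<std::mutex> guard(mu_);
        pending_.emplace(ptr, ev);
      }

      // Called at the top of each allocation, as ProcessEventsAndFree() was:
      // release every parked block whose event has completed.
      void Reap() {
        std::lock_guard<std::mutex> guard(mu_);
        for (auto it = pending_.begin(); it != pending_.end();) {
          if (it->second.done()) {
            std::free(it->first);
            it = pending_.erase(it);
          } else {
            ++it;
          }
        }
      }

     private:
      std::mutex mu_;
      std::unordered_map<void*, Event> pending_;
    };

Paddle's StreamSafeCUDAAllocator follows a similar record-then-reap scheme, so
removing the NPU-specific variant does not take the pattern out of the codebase.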
diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc deleted file mode 100644 index f64ed106bd730..0000000000000 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ /dev/null @@ -1,917 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_ASCEND_CL -#include - -#ifdef _POSIX_C_SOURCE -#undef _POSIX_C_SOURCE -#endif - -#ifdef _XOPEN_SOURCE -#undef _XOPEN_SOURCE -#endif - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/fleet/ascend_wrapper.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/pybind/ascend_wrapper_py.h" - -using namespace ge; // NOLINT -namespace py = pybind11; - -namespace paddle { -namespace pybind { - -#ifdef PADDLE_WITH_ASCEND_STRING -using AscendString = AscendString; -#else -using AscendString = std::string; -#endif - -void BindAscendWrapper(py::module *m) { - py::class_>(*m, "AscendInstance") - .def(py::init([]() { return framework::AscendInstance::GetInstance(); })) - .def("init_global_resources", - &framework::AscendInstance::InitGlobalResouces, - py::call_guard()) - .def("destroy_global_resources", - &framework::AscendInstance::DestroyGlobalResouces, - py::call_guard()) - .def("add_ascend_subgraph", - &framework::AscendInstance::AddAscendSubgraph, - py::call_guard()); -} - -std::map convert_map( - const std::map &options) { - std::map rets; - for (auto &option : options) { - AscendString key = option.first.c_str(); - AscendString val = option.second.c_str(); - rets[key] = val; - } - return rets; -} - -ge::Status ge_initialize( - std::map &options) { // NOLINT - py::gil_scoped_release release; - auto init_options = convert_map(options); - ge::Status res = ge::GEInitialize(init_options); - PADDLE_ENFORCE_EQ( - res, - ge::SUCCESS, - platform::errors::Fatal("ge initialize not success:%d", res)); - py::gil_scoped_acquire acquire; - return res; -} - -enum AttrType { - AT_INT64 = 0, - AT_INT32, - AT_UINT32, - AT_LIST_INT64, - AT_LIST_INT32, - AT_LIST_UINT32, - AT_FLOAT, - AT_LIST_FLOAT, - AT_ATTR_VALUE, - AT_STRING, - AT_LIST_STRING, - AT_BOOL, - AT_LIST_BOOL, - AT_TENSOR, - AT_LIST_TENSOR, - AT_LIST_UINT8, - AT_LIST_LIST_INT64, - AT_LIST_DT, - AT_DT, - AT_LIST_NAMEATTR, - AT_NAMEATTR -}; - -#ifdef PADDLE_WITH_ASCEND -void BindAscendDevice(py::module *m) { - py::class_(*m, "NPUDevice") - .def_static( - "get_device_count", - static_cast(&platform::ascend::NPUDevice::GetDeviceCount)); -} -#endif - -void BindAscendGraph(py::module *m) { - m->def("ge_initialize", &ge_initialize, "GEInitialize"); - m->def("ge_finalize", &GEFinalize, "GEFinalize"); - - // enum - py::enum_(*m, "GEGraphRunMode") - .value("PREDICTION", GraphRunMode::PREDICTION) - .value("TRAIN", GraphRunMode::TRAIN) - .export_values(); - - py::enum_(*m, "GEDataType") - .value("DT_FLOAT", DataType::DT_FLOAT) - .value("DT_FLOAT16", DataType::DT_FLOAT16) - .value("DT_INT8", DataType::DT_INT8) - .value("DT_INT16", 
DataType::DT_INT16) - .value("DT_UINT16", DataType::DT_UINT16) - .value("DT_UINT8", DataType::DT_UINT8) - .value("DT_INT32", DataType::DT_INT32) - .value("DT_INT64", DataType::DT_INT64) - .value("DT_UINT32", DataType::DT_UINT32) - .value("DT_UINT64", DataType::DT_UINT64) - .value("DT_BOOL", DataType::DT_BOOL) - .value("DT_DOUBLE", DataType::DT_DOUBLE) - .value("DT_STRING", DataType::DT_STRING) - .value("DT_DUAL_SUB_INT8", DataType::DT_DUAL_SUB_INT8) - .value("DT_DUAL_SUB_UINT8", DataType::DT_DUAL_SUB_UINT8) - .value("DT_COMPLEX64", DataType::DT_COMPLEX64) - .value("DT_COMPLEX128", DataType::DT_COMPLEX128) - .value("DT_QINT8", DataType::DT_QINT8) - .value("DT_QINT16", DataType::DT_QINT16) - .value("DT_QINT32", DataType::DT_QINT32) - .value("DT_QUINT8", DataType::DT_QUINT8) - .value("DT_QUINT16", DataType::DT_QUINT16) - .value("DT_RESOURCE", DataType::DT_RESOURCE) - .value("DT_STRING_REF", DataType::DT_STRING_REF) - .value("DT_DUAL", DataType::DT_DUAL) - .value("DT_UNDEFINED", DataType::DT_UNDEFINED) - .export_values(); - - py::enum_(*m, "GEFormat") - .value("FORMAT_NCHW", Format::FORMAT_NCHW) - .value("FORMAT_NHWC", Format::FORMAT_NHWC) - .value("FORMAT_ND", Format::FORMAT_ND) - .value("FORMAT_NC1HWC0", Format::FORMAT_NC1HWC0) - .value("FORMAT_FRACTAL_Z", Format::FORMAT_FRACTAL_Z) - .value("FORMAT_NC1C0HWPAD", Format::FORMAT_NC1C0HWPAD) - .value("FORMAT_NHWC1C0", Format::FORMAT_NHWC1C0) - .value("FORMAT_FSR_NCHW", Format::FORMAT_FSR_NCHW) - .value("FORMAT_FRACTAL_DECONV", Format::FORMAT_FRACTAL_DECONV) - .value("FORMAT_C1HWNC0", Format::FORMAT_C1HWNC0) - .value("FORMAT_FRACTAL_DECONV_TRANSPOSE", - Format::FORMAT_FRACTAL_DECONV_TRANSPOSE) - .value("FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS", - Format::FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS) - .value("FORMAT_NC1HWC0_C04", Format::FORMAT_NC1HWC0_C04) - .value("FORMAT_FRACTAL_Z_C04", Format::FORMAT_FRACTAL_Z_C04) - .value("FORMAT_CHWN", Format::FORMAT_CHWN) - .value("FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS", - Format::FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS) - .value("FORMAT_HWCN", Format::FORMAT_HWCN) - .value("FORMAT_NC1KHKWHWC0", Format::FORMAT_NC1KHKWHWC0) - .value("FORMAT_BN_WEIGHT", Format::FORMAT_BN_WEIGHT) - .value("FORMAT_FILTER_HWCK", Format::FORMAT_FILTER_HWCK) - .value("FORMAT_HASHTABLE_LOOKUP_LOOKUPS", - Format::FORMAT_HASHTABLE_LOOKUP_LOOKUPS) - .value("FORMAT_HASHTABLE_LOOKUP_KEYS", - Format::FORMAT_HASHTABLE_LOOKUP_KEYS) - .value("FORMAT_HASHTABLE_LOOKUP_VALUE", - Format::FORMAT_HASHTABLE_LOOKUP_VALUE) - .value("FORMAT_HASHTABLE_LOOKUP_OUTPUT", - Format::FORMAT_HASHTABLE_LOOKUP_OUTPUT) - .value("FORMAT_HASHTABLE_LOOKUP_HITS", - Format::FORMAT_HASHTABLE_LOOKUP_HITS) - .value("FORMAT_C1HWNCoC0", Format::FORMAT_C1HWNCoC0) - .value("FORMAT_MD", Format::FORMAT_MD) - .value("FORMAT_NDHWC", Format::FORMAT_NDHWC) - .value("FORMAT_FRACTAL_ZZ", Format::FORMAT_FRACTAL_ZZ) - .value("FORMAT_FRACTAL_NZ", Format::FORMAT_FRACTAL_NZ) - .value("FORMAT_NCDHW", Format::FORMAT_NCDHW) - .value("FORMAT_DHWCN", Format::FORMAT_DHWCN) - .value("FORMAT_NDC1HWC0", Format::FORMAT_NDC1HWC0) - .value("FORMAT_FRACTAL_Z_3D", Format::FORMAT_FRACTAL_Z_3D) - .value("FORMAT_CN", Format::FORMAT_CN) - .value("FORMAT_NC", Format::FORMAT_NC) - .value("FORMAT_DHWNC", Format::FORMAT_DHWNC) - .value("FORMAT_FRACTAL_Z_3D_TRANSPOSE", - Format::FORMAT_FRACTAL_Z_3D_TRANSPOSE) - .value("FORMAT_FRACTAL_ZN_LSTM", Format::FORMAT_FRACTAL_ZN_LSTM) - .value("FORMAT_FRACTAL_Z_G", Format::FORMAT_FRACTAL_Z_G) - .value("FORMAT_RESERVED", Format::FORMAT_RESERVED) - .value("FORMAT_ALL", 
Format::FORMAT_ALL) - .value("FORMAT_NULL", Format::FORMAT_NULL) - .export_values(); - - py::enum_(*m, "GEUnknowShapeOpType") - .value("DEPEND_IN_SHAPE", UnknowShapeOpType::DEPEND_IN_SHAPE) - .value("DEPEND_CONST_VALUE", UnknowShapeOpType::DEPEND_CONST_VALUE) - .value("DEPEND_SHAPE_RANGE", UnknowShapeOpType::DEPEND_SHAPE_RANGE) - .value("DEPEND_COMPUTE", UnknowShapeOpType::DEPEND_COMPUTE) - .export_values(); - - py::enum_(*m, "GEDeviceType") - .value("NPU", DeviceType::NPU) - .value("CPU", DeviceType::CPU) - .export_values(); - - py::enum_(*m, "GEAttrType") - .value("AT_INT64", AttrType::AT_INT64) - .value("AT_INT32", AttrType::AT_INT32) - .value("AT_UINT32", AttrType::AT_UINT32) - .value("AT_LIST_INT64", AttrType::AT_LIST_INT64) - .value("AT_LIST_INT32", AttrType::AT_LIST_INT32) - .value("AT_LIST_UINT32", AttrType::AT_LIST_UINT32) - .value("AT_FLOAT", AttrType::AT_FLOAT) - .value("AT_LIST_FLOAT", AttrType::AT_LIST_FLOAT) - .value("AT_ATTR_VALUE", AttrType::AT_ATTR_VALUE) - .value("AT_STRING", AttrType::AT_STRING) - .value("AT_LIST_STRING", AttrType::AT_LIST_STRING) - .value("AT_BOOL", AttrType::AT_BOOL) - .value("AT_LIST_BOOL", AttrType::AT_LIST_BOOL) - .value("AT_TENSOR", AttrType::AT_TENSOR) - .value("AT_LIST_TENSOR", AttrType::AT_LIST_TENSOR) - .value("AT_LIST_UINT8", AttrType::AT_LIST_UINT8) - .value("AT_LIST_LIST_INT64", AttrType::AT_LIST_LIST_INT64) - .value("AT_LIST_DT", AttrType::AT_LIST_DT) - .value("AT_DT", AttrType::AT_DT) - .value("AT_LIST_NAMEATTR", AttrType::AT_LIST_NAMEATTR) - .value("AT_NAMEATTR", AttrType::AT_NAMEATTR) - .export_values(); - - // 类封装 - py::class_(*m, "GESession") - .def(py::init([](const std::map &options) { - return std::unique_ptr( - new ge::Session(convert_map(options))); - })) - .def( - "add_graph", - (ge::Status(Session::*)(uint32_t, const Graph &)) & Session::AddGraph) - .def("add_graph", - [](Session &ss, - uint32_t index, - const Graph &graph, - const std::map &options) { - return ss.AddGraph(index, graph, convert_map(options)); - }) - .def("remove_graph", &Session::RemoveGraph) - .def( - "run_graph", - [](Session &ss, - uint32_t graphId, - const std::vector &inputs) -> py::tuple { - std::vector outputs; - ge::Status res = ss.RunGraph(graphId, inputs, outputs); - return py::make_tuple(outputs, res); - }, - py::call_guard()) - .def("build_graph", &Session::BuildGraph) - .def("run_graph_async", &Session::RunGraphAsync) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("register_call_back_func", - static_cast( - &ge::Session::RegisterCallBackFunc)) -#else - .def("register_call_back_func", - (Status (Session::*)( // NOLINT - const std::string &, - std::function ¶ms_list)>)) & - Session::RegisterCallBackFunc) -#endif - .def("is_graph_need_rebuild", &Session::IsGraphNeedRebuild); - - py::class_(*m, "GEGraph") - .def(py::init<>()) - .def(py::init()) - .def("set_inputs", &Graph::SetInputs) - .def("set_outputs", - (Graph & (Graph::*)(const std::vector &)) & - Graph::SetOutputs) - .def("set_outputs", - (Graph & (Graph::*)(const std::vector< - std::pair>> &)) & - Graph::SetOutputs) - .def("set_outputs", - (Graph & - (Graph::*)(const std::vector> - &)) & - Graph::SetOutputs) - .def("set_targets", &Graph::SetTargets) - .def("is_valid", &Graph::IsValid) - .def("add_op", &Graph::AddOp) - .def("find_op_by_name", - [](Graph &graph, const char *name) -> py::tuple { - ge::Operator op; - graphStatus status = graph.FindOpByName(name, op); - return py::make_tuple(op, status); - }) - .def("find_op_by_type", - [](Graph &graph, const char *type) -> py::tuple { - std::vector ops; - 
graphStatus status = graph.FindOpByType(type, ops); - return py::make_tuple(ops, status); - }) - .def("get_all_op_name", - [](Graph &graph) -> py::tuple { - std::vector op_name; - graphStatus status = graph.GetAllOpName(op_name); - return py::make_tuple(op_name, status); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("save_to_file", - static_cast( - &ge::Graph::SaveToFile)) - .def("load_from_file", - static_cast( - &Graph::LoadFromFile)) - .def("get_name", - static_cast( - &Graph::GetName)) -#else - .def("save_to_file", &Graph::SaveToFile) - .def("load_from_file", &Graph::LoadFromFile) - .def("get_name", &Graph::GetName) -#endif - .def("set_need_iteration", &Graph::SetNeedIteration); - - py::class_(*m, "GEOperator") - .def(py::init<>()) - .def(py::init()) - .def(py::init()) - .def("is_empty", &Operator::IsEmpty) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("get_name", - static_cast( - &Operator::GetName)) - .def("get_op_type", - static_cast( - &Operator::GetOpType)) - .def("set_input", - (Operator & (Operator::*)(const char *, const Operator &)) & - Operator::SetInput) - .def("set_input", - (Operator & - (Operator::*)(const char *, const Operator &, const char *)) & - Operator::SetInput) - .def( - "set_input", - (Operator & (Operator::*)(const char *, const Operator &, uint32_t)) & - Operator::SetInput) -#else - .def("get_name", &Operator::GetName) - .def("get_op_type", &Operator::GetOpType) - .def("set_input", - (Operator & (Operator::*)(const std::string &, const Operator &)) & - Operator::SetInput) - .def("set_input", - (Operator & (Operator::*)(const std::string &, const Operator &, - const std::string &)) & - Operator::SetInput) - .def("set_input", (Operator & (Operator::*)(const std::string &, - const Operator &, uint32_t)) & - Operator::SetInput) -#endif - .def("add_control_input", &Operator::AddControlInput) - .def("get_input_const_data", - [](Operator &op, const char *dst_name) -> py::tuple { - Tensor data; - graphStatus res = op.GetInputConstData(dst_name, data); - return py::make_tuple(data, res); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("get_input_desc", - (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetInputDesc) - .def("get_input_desc", - [](Operator &op, const std::string &name) { - return op.GetInputDescByName(name.c_str()); - }) - .def("get_dynamic_output_num", - static_cast( - &Operator::GetDynamicOutputNum)) - .def("get_dynamic_input_num", - static_cast( - &Operator::GetDynamicInputNum)) -#else - .def("get_input_desc", - (TensorDesc (Operator::*)(const std::string &) const) & - Operator::GetInputDesc) - .def("get_input_desc", - (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetInputDesc) - .def("get_dynamic_output_num", &Operator::GetDynamicOutputNum) - .def("get_dynamic_input_num", &Operator::GetDynamicInputNum) -#endif - .def("try_get_input_desc", - [](Operator &op, const char *name) -> py::tuple { - TensorDesc tensor_desc; - graphStatus status = op.TryGetInputDesc(name, tensor_desc); - return py::make_tuple(tensor_desc, status); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("update_input_desc", - static_cast(&Operator::UpdateInputDesc)) - .def("get_output_desc", - [](Operator &op, const std::string &name) { - return op.GetOutputDescByName(name.c_str()); - }) - .def("get_output_desc", - (TensorDesc(Operator::*)(uint32_t) const) & Operator::GetOutputDesc) - .def("update_output_desc", - static_cast(&Operator::UpdateOutputDesc)) - .def("get_dynamic_input_desc", - static_cast(&Operator::GetDynamicInputDesc)) - .def("update_dynamic_input_desc", - 
static_cast( - &Operator::UpdateDynamicInputDesc)) - .def("get_dynamic_output_desc", - static_cast(&Operator::GetDynamicOutputDesc)) - .def("update_dynamic_output_desc", - static_cast( - &Operator::UpdateDynamicOutputDesc)) -#else - .def("update_input_desc", &Operator::UpdateInputDesc) - .def("get_output_desc", - (TensorDesc (Operator::*)(const std::string &) const) & - Operator::GetOutputDesc) - .def("get_output_desc", - (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetOutputDesc) - .def("update_output_desc", &Operator::UpdateOutputDesc) - .def("get_dynamic_input_desc", &Operator::GetDynamicInputDesc) - .def("update_dynamic_input_desc", &Operator::UpdateDynamicInputDesc) - .def("get_dynamic_output_desc", &Operator::GetDynamicOutputDesc) - .def("update_dynamic_output_desc", &Operator::UpdateDynamicOutputDesc) -#endif - .def("infer_shape_and_type", &Operator::InferShapeAndType) - .def("set_inference_context", &Operator::SetInferenceContext) - .def("get_inference_context", &Operator::GetInferenceContext) - .def("verify_all_attr", &Operator::VerifyAllAttr) - .def("get_inputs_size", &Operator::GetInputsSize) - .def("get_outputs_size", &Operator::GetOutputsSize) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("get_all_attr_names_and_types", - static_cast &) const>( - &Operator::GetAllAttrNamesAndTypes)) -#else - .def("get_all_attr_names_and_types", &Operator::GetAllAttrNamesAndTypes) -#endif - .def("set_attr_int64", - [](Operator &op, const char *name, int64_t value) -> Operator & { - int64_t tar = (int64_t)value; - return op.SetAttr(name, tar); - }) - .def("set_attr_int32", - [](Operator &op, const char *name, int32_t value) -> Operator & { - int32_t tar = (int32_t)value; - return op.SetAttr(name, tar); - }) - .def("set_attr_uint32", - [](Operator &op, const char *name, uint32_t value) -> Operator & { - uint32_t tar = (uint32_t)value; - return op.SetAttr(name, tar); - }) - .def("set_attr_vec_int64", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - int64_t tmp; - for (int i = 0; i < len; i++) { - tmp = (int64_t)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) - .def("set_attr_vec_int32", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - int32_t tmp; - for (int i = 0; i < len; i++) { - tmp = (int32_t)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) - .def("set_attr_vec_uint32", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - uint32_t tmp; - for (int i = 0; i < len; i++) { - tmp = (uint32_t)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) - .def("set_attr_list_int64", - [](Operator &op, - const char *name, - std::initializer_list &attrValue) -> Operator & { - return op.SetAttr(name, std::move(attrValue)); - }) - .def("set_attr_attrvalue", - [](Operator &op, const char *name, AttrValue &attrValue) - -> Operator & { return op.SetAttr(name, std::move(attrValue)); }) - .def("set_attr_float", - [](Operator &op, const char *name, float value) -> Operator & { - float tar = static_cast(value); - return op.SetAttr(name, tar); - }) - .def("set_attr_vec_float", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - float tmp; - for (int i = 0; i < len; i++) { - tmp = static_cast(value[i]); - tar.push_back(tmp); - } - 
return op.SetAttr(name, tar); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("set_attr_string", - (Operator & (Operator::*)(const char *, const char *)) & - Operator::SetAttr) - .def("set_attr_vec_string", - (Operator & - (Operator::*)(const char *, const std::vector &)) & - Operator::SetAttr) -#else - .def("set_attr_string", (Operator & (Operator::*)(const std::string &, - const std::string &)) & - Operator::SetAttr) - .def("set_attr_vec_string", - (Operator & (Operator::*)(const std::string &, - const std::vector &)) & - Operator::SetAttr) -#endif - .def("set_attr_bool", - [](Operator &op, const char *name, bool value) -> Operator & { - if (value) - return op.SetAttr(name, true); - else - return op.SetAttr(name, false); - }) - .def("set_attr_vec_bool", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - for (int i = 0; i < len; i++) { - if (value[i]) - tar.push_back(true); - else - tar.push_back(false); - } - return op.SetAttr(name, tar); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("set_attr_tensor", - (Operator & (Operator::*)(const char *, const Tensor &)) & - Operator::SetAttr) - .def("set_attr_vec_tensor", - (Operator & - (Operator::*)(const char *, const std::vector &)) & - Operator::SetAttr) -#else - .def("set_attr_tensor", - (Operator & (Operator::*)(const std::string &, const Tensor &)) & - Operator::SetAttr) - .def("set_attr_vec_tensor", - (Operator & - (Operator::*)(const std::string &, const std::vector &)) & - Operator::SetAttr) -#endif - .def("set_attr_vec_uint8", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - uint8_t tmp; - for (int i = 0; i < len; i++) { - tmp = (uint8_t)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) -#ifdef PADDLE_WITH_ASCEND_STRING - .def("set_attr_vec_vec_int64", - (Operator & - (Operator::*)(const char *, - const std::vector> &)) & - Operator::SetAttr) -#else - .def("set_attr_vec_vec_int64", - (Operator & - (Operator::*)(const std::string &, - const std::vector> &)) & - Operator::SetAttr) -#endif - .def("set_attr_vec_dtype", - [](Operator &op, - const char *name, - const std::vector &value) -> Operator & { - int len = value.size(); - std::vector tar; - ge::DataType tmp; - for (int i = 0; i < len; i++) { - tmp = (ge::DataType)value[i]; - tar.push_back(tmp); - } - return op.SetAttr(name, tar); - }) - .def("set_attr_dtype", - [](Operator &op, - const char *name, - const DataType &value) -> Operator & { - ge::DataType tar = (ge::DataType)value; - return op.SetAttr(name, tar); - }) - .def("get_attr", - [](Operator &op, const char *name, AttrType type) -> py::tuple { - graphStatus res = -1; - switch (type) { - case AT_INT64: { - int64_t i_64_av; - res = op.GetAttr(name, i_64_av); - return py::make_tuple(i_64_av, res); - } break; - case AT_INT32: { - int32_t i_32_av; - res = op.GetAttr(name, i_32_av); - return py::make_tuple(i_32_av, res); - } break; - case AT_UINT32: { - uint32_t ui_32_av; - res = op.GetAttr(name, ui_32_av); - return py::make_tuple(ui_32_av, res); - } break; - case AT_LIST_INT64: { - std::vector v_i_64_av; - res = op.GetAttr(name, v_i_64_av); - return py::make_tuple(v_i_64_av, res); - } break; - case AT_LIST_INT32: { - std::vector v_i_32_av; - res = op.GetAttr(name, v_i_32_av); - return py::make_tuple(v_i_32_av, res); - } break; - case AT_LIST_UINT32: { - std::vector v_ui_32_av; - res = op.GetAttr(name, v_ui_32_av); - return py::make_tuple(v_ui_32_av, res); 
-               } break;
-               case AT_FLOAT: {
-                 float f_av;
-                 res = op.GetAttr(name, f_av);
-                 return py::make_tuple(f_av, res);
-               } break;
-               case AT_LIST_FLOAT: {
-                 std::vector<float> v_f_av;
-                 res = op.GetAttr(name, v_f_av);
-                 return py::make_tuple(v_f_av, res);
-               } break;
-               case AT_ATTR_VALUE: {
-                 AttrValue o_av;
-                 res = op.GetAttr(name, o_av);
-                 return py::make_tuple(o_av, res);
-               } break;
-               case AT_STRING: {
-                 AscendString s_av;
-                 res = op.GetAttr(name, s_av);
-                 return py::make_tuple(s_av, res);
-               } break;
-               case AT_LIST_STRING: {
-                 std::vector<AscendString> v_s_av;
-                 res = op.GetAttr(name, v_s_av);
-                 return py::make_tuple(v_s_av, res);
-               } break;
-               case AT_BOOL: {
-                 bool b_av;
-                 res = op.GetAttr(name, b_av);
-                 return py::make_tuple(b_av, res);
-               } break;
-               case AT_LIST_BOOL: {
-                 std::vector<bool> v_b_av;
-                 res = op.GetAttr(name, v_b_av);
-                 return py::make_tuple(v_b_av, res);
-               } break;
-               case AT_TENSOR: {
-                 Tensor t_av;
-                 res = op.GetAttr(name, t_av);
-                 return py::make_tuple(t_av, res);
-               } break;
-               case AT_LIST_TENSOR: {
-                 std::vector<Tensor> v_t_av;
-                 res = op.GetAttr(name, v_t_av);
-                 return py::make_tuple(v_t_av, res);
-               } break;
-               case AT_LIST_UINT8: {
-                 std::vector<uint8_t> v_ui_8_av;
-                 res = op.GetAttr(name, v_ui_8_av);
-                 return py::make_tuple(v_ui_8_av, res);
-               } break;
-               case AT_LIST_LIST_INT64: {
-                 std::vector<std::vector<int64_t>> v_v_i_64_av;
-                 res = op.GetAttr(name, v_v_i_64_av);
-                 return py::make_tuple(v_v_i_64_av, res);
-               } break;
-               case AT_DT: {
-                 ge::DataType dt_av;
-                 res = op.GetAttr(name, dt_av);
-                 return py::make_tuple(dt_av, res);
-               } break;
-               case AT_LIST_DT: {
-                 std::vector<ge::DataType> v_dt_av;
-                 res = op.GetAttr(name, v_dt_av);
-                 return py::make_tuple(v_dt_av, res);
-               } break;
-               default:
-                 return py::make_tuple(0, res);
-                 break;
-             }
-           })
-      .def("break_connect", &Operator::BreakConnect)
-      .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount)
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def("get_subgraph_names",
-           static_cast<graphStatus (Operator::*)(std::vector<AscendString> &)
-                           const>(&Operator::GetSubgraphNames))
-      .def("get_subgraph_builder",
-           static_cast<SubgraphBuilder (Operator::*)(const char *) const>(
-               &Operator::GetSubgraphBuilder))
-      .def("get_subgraph",
-           static_cast<Graph (Operator::*)(const char *) const>(
-               &Operator::GetSubgraph))
-      .def("get_dynamic_subgraph_builder",
-           static_cast<SubgraphBuilder (Operator::*)(const char *, uint32_t)
-                           const>(&Operator::GetDynamicSubgraphBuilder))
-      .def("get_dynamic_subgraph",
-           static_cast<Graph (Operator::*)(const char *, uint32_t) const>(
-               &Operator::GetDynamicSubgraph));
-#else
-      .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount)
-      .def("get_subgraph_names", &Operator::GetSubgraphNames)
-      .def("get_subgraph_builder", &Operator::GetSubgraphBuilder)
-      .def("get_subgraph", &Operator::GetSubgraph)
-      .def("get_dynamic_subgraph_builder", &Operator::GetDynamicSubgraphBuilder)
-      .def("get_dynamic_subgraph", &Operator::GetDynamicSubgraph);
-#endif
-
-  py::class_<Tensor>(*m, "GETensor")
-      .def(py::init<>())
-      .def(py::init<const TensorDesc &>())
-      .def(py::init<const TensorDesc &, const std::vector<uint8_t> &>())
-      .def(py::init<const TensorDesc &, const uint8_t *, size_t>())
-      .def("set_tensor_desc", &Tensor::SetTensorDesc)
-      .def("get_tensor_desc", &Tensor::GetTensorDesc)
-      // .def("set_data", (graphStatus(Tensor::*)(std::vector<uint8_t> &&)) &
-      // Tensor::SetData)
-      .def("set_data",
-           (graphStatus(Tensor::*)(const std::vector<uint8_t> &)) &
-               Tensor::SetData)
-      .def("set_data",
-           (graphStatus(Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData)
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def("set_data", (graphStatus(Tensor::*)(const char *)) & Tensor::SetData)
-#else
-      .def("set_data",
-           (graphStatus (Tensor::*)(const std::string &)) & Tensor::SetData)
-#endif
-      .def("set_data",
-           (graphStatus(Tensor::*)(const std::vector<AscendString> &)) &
-               Tensor::SetData)
-
-      .def("get_data",
-           [](Tensor &ts) -> py::list {
-             py::list v_data;
-             uint8_t *data = ts.GetData();
-             size_t size = ts.GetSize();
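-             // GetData() exposes the tensor's raw byte buffer and GetSize()
-             // its length; appending byte by byte gives the Python list its
-             // own copy of the data.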
-             for (size_t i = 0; i < size; ++i) {
-               v_data.append(data[i]);
-             }
-             return v_data;
-           })
-      .def("get_size", &Tensor::GetSize)
-      .def("is_valid", &Tensor::IsValid)
-      .def("clone", &Tensor::Clone);
-
-  py::class_<TensorDesc>(*m, "GETensorDesc")
-      .def(py::init<>())
-      .def(py::init<Shape, Format, DataType>(),
-           py::arg("shape"),
-           py::arg("format") = FORMAT_ND,
-           py::arg("dt") = DT_FLOAT)
-      .def(py::init<const TensorDesc &>())
-      .def("update",
-           (void(TensorDesc::*)(const Shape &, Format, DataType)) &
-               TensorDesc::Update,
-           py::arg("shape"),
-           py::arg("format") = FORMAT_ND,
-           py::arg("dt") = DT_FLOAT)
-      .def("set_shape", &TensorDesc::SetShape)
-      .def("get_shape", &TensorDesc::GetShape)
-      .def("set_unknown_dim_num_shape", &TensorDesc::SetUnknownDimNumShape)
-      .def("set_shape_range", &TensorDesc::SetShapeRange)
-      .def("get_shape_range",
-           [](TensorDesc &tensorDesc) -> py::tuple {
-             std::vector<std::pair<int64_t, int64_t>> range;
-             graphStatus status = tensorDesc.GetShapeRange(range);
-             return py::make_tuple(range, status);
-           })
-      .def("set_format", &TensorDesc::SetFormat)
-      .def("get_format", &TensorDesc::GetFormat)
-      .def("get_origin_shape", &TensorDesc::GetOriginShape)
-      .def("set_origin_shape", &TensorDesc::SetOriginShape)
-      .def("set_origin_format", &TensorDesc::SetOriginFormat)
-      .def("get_origin_format", &TensorDesc::GetOriginFormat)
-      .def("set_data_type", &TensorDesc::SetDataType)
-      .def("get_data_type", &TensorDesc::GetDataType)
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def("set_name",
-           static_cast<void (TensorDesc::*)(const char *)>(
-               &TensorDesc::SetName))
-      .def("get_name",
-           static_cast<graphStatus (TensorDesc::*)(AscendString &) const>(
-               &TensorDesc::GetName))
-#else
-      .def("set_name", &TensorDesc::SetName)
-      .def("get_name", &TensorDesc::GetName)
-#endif
-      .def("set_size", &TensorDesc::SetSize)
-      .def("get_size", &TensorDesc::GetSize)
-      .def("set_real_dim_cnt", &TensorDesc::SetRealDimCnt)
-      .def("get_real_dim_cnt", &TensorDesc::GetRealDimCnt);
-
-  py::class_<Shape>(*m, "GEShape")
-      .def(py::init<>())
-      .def(py::init<const std::vector<int64_t> &>())
-      .def("get_dim_num", &Shape::GetDimNum)
-      .def("set_dim", &Shape::SetDim)
-      .def("get_dim", &Shape::GetDim)
-      .def("get_dims", &Shape::GetDims)
-      .def("get_shape_size", &Shape::GetShapeSize);
-
-  py::class_<AttrValue>(*m, "GEAttrValue").def(py::init<>());
-
-  py::class_<OperatorFactory>(*m, "GEOperatorFactory")
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def_static("create_operator",
-                  static_cast<Operator (*)(const char *, const char *)>(
-                      &ge::OperatorFactory::CreateOperator))
-#else
-      .def("create_operator", &OperatorFactory::CreateOperator)
-#endif
-      .def("get_ops_type_list",
-           []() -> py::tuple {
-             std::vector<AscendString> all_ops;
-             graphStatus status = OperatorFactory::GetOpsTypeList(all_ops);
-             return py::make_tuple(all_ops, status);
-           })
-#ifdef PADDLE_WITH_ASCEND_STRING
-      .def_static(
-          "is_exist_op",
-          static_cast<bool (*)(const char *)>(&OperatorFactory::IsExistOp));
-#else
-      .def("is_exist_op", &OperatorFactory::IsExistOp);
-#endif
-}
-
-}  // namespace pybind
-}  // namespace paddle
-#endif
diff --git a/paddle/fluid/pybind/ascend_wrapper_py.h b/paddle/fluid/pybind/ascend_wrapper_py.h
deleted file mode 100644
index 15fb056c90e02..0000000000000
--- a/paddle/fluid/pybind/ascend_wrapper_py.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-
-namespace py = pybind11;
-
-namespace paddle {
-namespace pybind {
-
-void BindAscendGraph(py::module* m);
-void BindAscendWrapper(py::module* m);
-void BindAscendDevice(py::module* m);
-
-}  // namespace pybind
-}  // namespace paddle
-#endif
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 6b5f5cb003c5d..57b62dc40870d 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -2616,19 +2616,6 @@ void BindImperative(py::module *m_ptr) {
         py::arg("ring_id"));
 #endif
 
-#if defined(PADDLE_WITH_ASCEND_CL)
-  py::class_<imperative::HCCLParallelContext,
-             imperative::ParallelContext,
-             std::shared_ptr<imperative::HCCLParallelContext>>(
-      m, "HCCLParallelContext")
-      .def(py::init<const imperative::ParallelStrategy &,
-                    const platform::NPUPlace &>())
-      .def("init", [](imperative::HCCLParallelContext &self) { self.Init(); })
-      .def("init_with_ring_id",
-           &imperative::HCCLParallelContext::InitWithRingID,
-           py::arg("ring_id"));
-#endif
-
 #if defined(PADDLE_WITH_CNCL)
diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc
--- a/paddle/fluid/pybind/place.cc
+++ b/paddle/fluid/pybind/place.cc
   py::class_<platform::NPUPlace> npuplace(m, "NPUPlace", R"DOC(
     NPUPlace is a descriptor of a device.
     It represents a NPU device on which a tensor will be allocated and a model will run.
-
     Examples:
         .. code-block:: python
-          # required: npu
-
           import paddle
           place = paddle.NPUPlace(0)
-
   )DOC");
   g_npuplace_pytype = reinterpret_cast<PyTypeObject *>(npuplace.ptr());
-  npuplace
-      .def("__init__",
-           [](platform::NPUPlace &self, int dev_id) {
-#ifdef PADDLE_WITH_ASCEND_CL
-             if (UNLIKELY(dev_id < 0)) {
-               LOG(ERROR) << string::Sprintf(
-                   "Invalid NPUPlace(%d), device id must be 0 or "
-                   "positive integer",
-                   dev_id);
-               std::exit(-1);
-             }
-             if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) {
-               if (platform::GetNPUDeviceCount() == 0) {
-                 LOG(ERROR) << "Cannot use NPU because there is no NPU "
-                               "detected on your "
-                               "machine.";
-                 std::exit(-1);
-               } else {
-                 LOG(ERROR) << string::Sprintf(
-                     "Invalid NPUPlace(%d), must be inside [0, %d), because NPU "
-                     "number on your machine is %d",
-                     dev_id,
-                     platform::GetNPUDeviceCount(),
-                     platform::GetNPUDeviceCount());
-                 std::exit(-1);
-               }
-             }
-             new (&self) platform::NPUPlace(dev_id);
-#else
-             LOG(ERROR) << string::Sprintf(
-                 "Cannot use NPU because you have installed CPU/GPU version "
-                 "PaddlePaddle.\n"
-                 "If you want to use NPU, please try to install NPU version "
-                 "PaddlePaddle by: pip install paddlepaddle-npu\n"
-                 "If you only have CPU, please change NPUPlace(%d) to be "
-                 "CPUPlace().\n",
-                 dev_id);
-             std::exit(-1);
-#endif
-           })
+  npuplace.def("__init__", [](platform::NPUPlace &self, int dev_id) {})
       .def("_type", &PlaceIndex<platform::NPUPlace>)
       .def("_equals", &IsSamePlace<platform::NPUPlace, platform::Place>)
       .def("_equals", &IsSamePlace<platform::NPUPlace, platform::CUDAPlace>)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 977c99f30fc5f..e306e0338462f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -154,10 +154,6 @@ limitations under the License. */
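The pybind.cc hunks below retire the usual compile-time capability query for a backend. For reference, a minimal sketch of the idiom being removed, where PADDLE_WITH_FOO and IsCompiledWithFoo are hypothetical stand-ins rather than real Paddle symbols:

// Sketch only: PADDLE_WITH_FOO is a placeholder for a backend build guard
// such as the removed PADDLE_WITH_ASCEND_CL.
bool IsCompiledWithFoo() {
#ifdef PADDLE_WITH_FOO
  return true;   // the optional backend was compiled in
#else
  return false;  // the backend was compiled out
#endif
}

Once the guard can never be defined, only the false arm is reachable, which is why IsCompiledWithNPU() below is folded to a one-line constant stub.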
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/platform/collective_helper.h" -#endif - #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" @@ -285,13 +281,7 @@ bool IsCompiledWithXPU() { #endif } -bool IsCompiledWithNPU() { -#ifndef PADDLE_WITH_ASCEND_CL - return false; -#else - return true; -#endif -} +bool IsCompiledWithNPU() { return false; } bool IsCompiledWithCustomDevice(std::string device_type) { #ifndef PADDLE_WITH_CUSTOM_DEVICE @@ -1606,13 +1596,9 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::NPUPlace &place) -> paddle::platform::DeviceContext * { -#ifndef PADDLE_WITH_ASCEND_CL PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use NPUPlace in CPU/GPU/XPU version, " "Please recompile or reinstall Paddle with NPU support.")); -#else - return new paddle::platform::NPUDeviceContext(place); -#endif }) .def_static("create", [](paddle::platform::CustomPlace &place) @@ -2338,39 +2324,6 @@ All parameter, weight, gradient are variables in Paddle. #endif #endif -#ifdef PADDLE_WITH_ASCEND_CL - m.def("get_npu_device_count", platform::GetNPUDeviceCount); - m.def("npu_finalize", []() { - platform::HCCLCommContext::Instance().ReleaseHCCLComms(); - - auto &pool = platform::DeviceContextPool::Instance(); - auto devices = platform::GetSelectedNPUDevices(); - for (size_t i = 0; i < devices.size(); ++i) { - platform::NPUDeviceGuard guard(devices[i]); - pool.Get(platform::NPUPlace(devices[i]))->Wait(); - } - platform::AclInstance::Instance().Finalize(); - }); - - py::class_(m, "NPUProfConfigWrapper"); - - m.def("npu_prof_init", platform::NPUProfilerInit); - m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerStart(c.ptr()); - }); - m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerStop(c.ptr()); - }); - m.def("npu_prof_finalize", platform::NPUProfilerFinalize); - m.def("npu_prof_create_config", []() { - return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig()); - }); - - m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerDestroyConfig(c.ptr()); - }); -#endif - #ifdef PADDLE_WITH_IPU m.def("get_ipu_device_count", platform::GetIPUDeviceCount); #endif diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index e2e8d7c8837d9..b854fa37ac0ba 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -139,10 +139,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/platform/collective_helper.h" -#endif - #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index e050fc7c7d544..60f7d83f03ac9 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -299,13 +299,6 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { auto p = self.place(); paddle::memory::Copy( platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); -#endif - } else if (platform::is_npu_place(self.place())) { -#if defined(PADDLE_WITH_ASCEND_CL) - const T *a = self.data(); - auto p = self.place(); - paddle::memory::Copy( - platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); #endif } else if (platform::is_custom_place(self.place())) { #if defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -350,13 +343,6 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { T *a = self->mutable_data(p); paddle::memory::Copy( p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); -#endif - } else if (platform::is_npu_place(self->place())) { -#if defined(PADDLE_WITH_ASCEND_CL) - auto p = self->place(); - T *a = self->mutable_data(p); - paddle::memory::Copy( - p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); #endif } else if (platform::is_custom_place(self->place())) { #if defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -427,21 +413,6 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " "Please recompile or reinstall Paddle with IPU support.")); -#endif - } else if (paddle::platform::is_npu_place(place)) { -#ifdef PADDLE_WITH_ASCEND_CL - platform::Place tmp_place = place; - platform::NPUDeviceGuard guard(tmp_place.device); - auto dst = self->mutable_data(place); - platform::NPUMemcpySync( - dst, array.data(), array.nbytes(), ACL_MEMCPY_HOST_TO_DEVICE); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(place); - ctx.Wait(); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use NPUPlace in CPU/GPU/XPU version. 
" - "Please recompile or reinstall Paddle with NPU support.")); #endif } else if (paddle::platform::is_mlu_place(place)) { #ifdef PADDLE_WITH_MLU @@ -1093,39 +1064,6 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); -#endif - } else if (is_npu_tensor) { -#ifdef PADDLE_WITH_ASCEND_CL - py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); - PADDLE_ENFORCE_EQ(py_arr.writeable(), - true, - platform::errors::InvalidArgument( - "PyArray is not writable, in which case memory leak " - "or double free would occur")); - PADDLE_ENFORCE_EQ( - py_arr.owndata(), - true, - platform::errors::InvalidArgument( - "PyArray does not own data, in which case memory leak " - "or double free would occur")); - - size_t copy_bytes = sizeof_dtype * numel; - auto p = tensor.place(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(tensor.place()); - paddle::memory::Copy( - platform::CPUPlace(), - py_arr.mutable_data(), - p, - tensor_buf_ptr, - copy_bytes, - reinterpret_cast(ctx).stream()); - ctx.Wait(); - return py_arr; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use NPUPlace in CPU/GPU/XPU version, " - "Please recompile or reinstall Paddle with NPU support.")); #endif } else if (is_mlu_tensor) { #ifdef PADDLE_WITH_MLU diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index a9e1fc384085a..3804ea984f973 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -19,9 +19,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" -#if defined(PADDLE_WITH_ASCEND_CL) -#include "paddle/phi/backends/npu/npu_info.h" -#endif + #include "paddle/phi/backends/gpu/gpu_info.h" #ifdef PADDLE_WITH_MLU #include "paddle/phi/backends/mlu/mlu_info.h" @@ -44,8 +42,6 @@ inline size_t Alignment(size_t size, alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) alignment = alignment; -#elif defined(PADDLE_WITH_ASCEND_CL) - alignment = phi::backends::npu::NPUMinChunkSize(); #elif defined(PADDLE_WITH_MLU) alignment = phi::backends::mlu::MLUMinChunkSize(); #else diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 85826fe1cf79f..5225d746f29f0 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -66,11 +66,6 @@ if(WITH_ROCM) phi_dynload_warprnnt SRCS warprnnt.cc DEPS phi_dynamic_loader warprnnt) -elseif(WITH_ASCEND_CL) - cc_library( - phi_dynload_warpctc - SRCS warpctc.cc - DEPS phi_dynamic_loader warpctc npu_hccl) else() nv_library( phi_dynload_cuda diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index c7869e7eea82c..fc32e6fe35ccb 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -522,24 +522,6 @@ void* GetNCCLDsoHandle() { FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); #endif } -void* GetHCCLDsoHandle() { - std::string warning_msg( - "You may need to install 'hccl2' from Huawei official website: " - "before install PaddlePaddle."); -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath( - 
FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg); -#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) - return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); - -#elif defined(PADDLE_WITH_ASCEND_CL) - return GetDsoHandleFromSearchPath( - FLAGS_hccl_dir, "libhccl.so", true, {}, warning_msg); -#else - return GetDsoHandleFromSearchPath( - FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); -#endif -} void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) diff --git a/paddle/phi/backends/dynload/dynamic_loader.h b/paddle/phi/backends/dynload/dynamic_loader.h index c8dec39fa8356..e248696e9e689 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.h +++ b/paddle/phi/backends/dynload/dynamic_loader.h @@ -38,7 +38,6 @@ void* GetWarpCTCDsoHandle(); void* GetWarpRNNTDsoHandle(); void* GetFlashAttnDsoHandle(); void* GetNCCLDsoHandle(); -void* GetHCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetLAPACKDsoHandle(); diff --git a/paddle/phi/backends/npu/npu_info.h b/paddle/phi/backends/npu/npu_info.h deleted file mode 100644 index 21206ae0b28f3..0000000000000 --- a/paddle/phi/backends/npu/npu_info.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef PADDLE_WITH_ASCEND_CL - -namespace phi { -namespace backends { -namespace npu { - -//! Get the minimum chunk size for NPU buddy allocator. -inline size_t NPUMinChunkSize() { - // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU, - // though no document specify that explicitly. - // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for - // details. - return 1 << 9; -} - -} // namespace npu -} // namespace backends -} // namespace phi - -#endif diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index b384bed077b27..9cff3acccbd41 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -120,8 +120,7 @@ PADDLE_DEFINE_EXPORTED_bool( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * CUDA related related FLAG @@ -203,37 +202,6 @@ PADDLE_DEFINE_EXPORTED_int64( " epilogue algorithms, default is 0, means disabling exhaustive search."); #endif -#if defined(PADDLE_WITH_ASCEND_CL) -PADDLE_DEFINE_EXPORTED_string( - selected_npus, - "", - "A list of device ids separated by comma, like: 0,1,2,3. " - "This option is useful when doing multi process training and " - "each process have only one device (NPU). 
-    "all visible devices, set this to empty string.");
-PADDLE_DEFINE_EXPORTED_bool(
-    hccl_check_nan,
-    true,
-    "Check Nan in tensor before hccl_allreduce_sum, otherwise it will "
-    "core dump when it meets a Nan value");
-PADDLE_DEFINE_EXPORTED_string(
-    npu_config_path,
-    "",
-    "The absolute path of configuration json file, like: /tmp/config.json. "
-    "If provided, it will be passed to aclInit().");
-PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling,
-                             1,
-                             "Set the minimum loss scaling value.");
-PADDLE_DEFINE_EXPORTED_string(
-    npu_precision_mode,
-    "",
-    "NPU operator precision mode, options are 'force_fp32', 'force_fp16', "
-    "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and "
-    "'allow_mix_precision'. If you want to use the default mode ("
-    "allow_fp32_to_fp16), set this to empty string. For more details, "
-    "please refer to the documents.");
-#endif
-
 /*
  * Kernel related FLAG
  * Name: FLAGS_enable_api_kernel_fallback
@@ -558,8 +526,7 @@ PADDLE_DEFINE_EXPORTED_double(
 
 // NOTE(zhiqiu): better to share the flags, otherwise we will have too many
 // flags.
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU) || \
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
     defined(PADDLE_WITH_CUSTOM_DEVICE)
 
 /**
@@ -837,9 +804,8 @@ PADDLE_DEFINE_EXPORTED_bool(use_fast_math,
  * Example:
  * Note: Get host by name time.
  */
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
+    defined(PADDLE_WITH_HIP)
 PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time,
                              120,
                              "The maximum time for get host by name time");
diff --git a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h
index e2e2ffec1bfee..4a8cbd38d3df7 100644
--- a/paddle/phi/core/utils/visit_place.h
+++ b/paddle/phi/core/utils/visit_place.h
@@ -52,26 +52,6 @@ typename Visitor::result_type VisitPlace(const phi::Place& place,
       PADDLE_THROW(phi::errors::Unavailable(
           ("Paddle is not compiled with XPU. Cannot visit xpu device")));
       return typename Visitor::result_type();
-#endif
-    }
-    case phi::AllocationType::NPU: {
-#ifdef PADDLE_WITH_ASCEND_CL
-      phi::NPUPlace p(place.GetDeviceId());
-      return visitor(p);
-#else
-      PADDLE_THROW(phi::errors::Unavailable(
-          ("Paddle is not compiled with NPU. Cannot visit npu device")));
-      return typename Visitor::result_type();
-#endif
-    }
-    case phi::AllocationType::NPUPINNED: {
-#ifdef PADDLE_WITH_ASCEND_CL
-      phi::NPUPinnedPlace p;
-      return visitor(p);
-#else
-      PADDLE_THROW(phi::errors::Unavailable(
-          ("Paddle is not compiled with NPU. Cannot visit npu_pinned")));
-      return typename Visitor::result_type();
 #endif
     }
     case phi::AllocationType::IPU: {
diff --git a/paddle/phi/kernels/funcs/interpolate_function.h b/paddle/phi/kernels/funcs/interpolate_function.h
index 76ed2ccd1a9b7..23731285926da 100644
--- a/paddle/phi/kernels/funcs/interpolate_function.h
+++ b/paddle/phi/kernels/funcs/interpolate_function.h
@@ -142,13 +142,6 @@ inline std::vector<T> get_new_data_from_tensor(
     new_data = cpu_starts_tensor.data<T>();
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (new_data_tensor->place().GetType() == phi::AllocationType::NPU) {
-    phi::Copy(
-        *dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
-    new_data = cpu_starts_tensor.data<T>();
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (new_data_tensor->place().GetType() == phi::AllocationType::XPU) {
     phi::Copy(
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 580aef0ef59af..aca40f83219c9 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -15,47 +15,23 @@ function(py_test_modules TARGET_NAME)
   if(WITH_COVERAGE
      AND NOT (WITH_INCREMENTAL_COVERAGE
               AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL ""))
-    if(WITH_ASCEND_CL)
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env
-          PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
-          ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        ${py_test_modules_ENVS}
+        COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+        ${PYTHON_EXECUTABLE} -m coverage run --branch -p
+        ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else()
-    if(WITH_ASCEND_CL)
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env
-          PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
-          ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
+        ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 
   if(py_test_modules_SERIAL)
diff --git a/test/amp/CMakeLists.txt b/test/amp/CMakeLists.txt
index b4d5bfd6b84bf..60cf0f5fa43d2 100755
--- a/test/amp/CMakeLists.txt
+++ b/test/amp/CMakeLists.txt
@@ -14,47 +14,23 @@ function(py_test_modules TARGET_NAME)
   if(WITH_COVERAGE
      AND NOT (WITH_INCREMENTAL_COVERAGE
               AND "$ENV{PADDLE_GIT_DIFF_PY_FILE}" STREQUAL ""))
-    if(WITH_ASCEND_CL)
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env
-          PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
-          ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS}
-          COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-          ${PYTHON_EXECUTABLE} -m coverage run --branch -p
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        ${py_test_modules_ENVS}
+        COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
+        ${PYTHON_EXECUTABLE} -m coverage run --branch -p
+        ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else()
-    if(WITH_ASCEND_CL)
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env
-          PYTHONPATH=${PADDLE_BINARY_DIR}/python:$ENV{PYTHONPATH}
-          ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    else()
-      add_test(
-        NAME ${TARGET_NAME}
-        COMMAND
-          ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
-          ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
-          ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
+    add_test(
+      NAME ${TARGET_NAME}
+      COMMAND
+        ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
+        ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE}
+        ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 
   if(py_test_modules_SERIAL)
diff --git a/test/asp/CMakeLists.txt b/test/asp/CMakeLists.txt
index b48b833b94602..ab9c17edee0ac 100644
--- a/test/asp/CMakeLists.txt
+++ b/test/asp/CMakeLists.txt
@@ -13,10 +13,7 @@ foreach(TEST_OP ${TEST_OPS})
 endforeach()
 
 if(WITH_DISTRIBUTE)
-  if(WITH_GPU
-     OR WITH_XPU
-     OR WITH_ASCEND
-     OR WITH_ASCEND_CL)
+  if(WITH_GPU OR WITH_XPU)
     py_test_modules(test_fleet_with_asp_dynamic MODULES
                     test_fleet_with_asp_dynamic ENVS ${dist_ENVS})
     py_test_modules(test_fleet_with_asp_static MODULES