From e2175116855add7a2d8569685f0b2dc994765227 Mon Sep 17 00:00:00 2001 From: XUANBABY Date: Thu, 3 Dec 2020 17:39:30 +0800 Subject: [PATCH 01/10] =?UTF-8?q?=E6=B7=BB=E5=8A=A0rocm=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 15 +- cmake/configure.cmake | 6 +- cmake/external/eigen.cmake | 2 +- cmake/external/pybind11.cmake | 2 +- cmake/external/rocprim.cmake | 49 ----- cmake/flags.cmake | 2 +- cmake/generic.cmake | 59 +++++- cmake/hip.cmake | 104 +++++++--- cmake/operators.cmake | 52 ++++- cmake/third_party.cmake | 5 - paddle/fluid/operators/math/CMakeLists.txt | 4 +- paddle/fluid/platform/dynload/CMakeLists.txt | 19 +- .../fluid/platform/dynload/dynamic_loader.cc | 53 ++++++ .../fluid/platform/dynload/dynamic_loader.h | 8 + paddle/fluid/platform/dynload/hiprand.cc | 30 +++ paddle/fluid/platform/dynload/hiprand.h | 56 ++++++ paddle/fluid/platform/dynload/hiprtc.cc | 36 ++++ paddle/fluid/platform/dynload/hiprtc.h | 64 +++++++ paddle/fluid/platform/dynload/miopen.cc | 70 +++++++ paddle/fluid/platform/dynload/miopen.h | 178 ++++++++++++++++++ paddle/fluid/platform/dynload/rccl.cc | 30 +++ paddle/fluid/platform/dynload/rccl.h | 64 +++++++ paddle/fluid/platform/dynload/rocblas.cc | 40 ++++ paddle/fluid/platform/dynload/rocblas.h | 105 +++++++++++ paddle/fluid/platform/dynload/rocm_driver.cc | 35 ++++ paddle/fluid/platform/dynload/rocm_driver.h | 65 +++++++ paddle/fluid/platform/dynload/rocrand.cc | 30 +++ paddle/fluid/platform/dynload/rocrand.h | 55 ++++++ paddle/fluid/platform/float16.h | 80 ++++---- paddle/fluid/platform/hostdevice.h | 6 +- paddle/fluid/pybind/CMakeLists.txt | 10 +- 31 files changed, 1189 insertions(+), 145 deletions(-) delete mode 100644 cmake/external/rocprim.cmake create mode 100644 paddle/fluid/platform/dynload/hiprand.cc create mode 100644 paddle/fluid/platform/dynload/hiprand.h create mode 100644 
paddle/fluid/platform/dynload/hiprtc.cc create mode 100644 paddle/fluid/platform/dynload/hiprtc.h create mode 100644 paddle/fluid/platform/dynload/miopen.cc create mode 100644 paddle/fluid/platform/dynload/miopen.h create mode 100644 paddle/fluid/platform/dynload/rccl.cc create mode 100644 paddle/fluid/platform/dynload/rccl.h create mode 100644 paddle/fluid/platform/dynload/rocblas.cc create mode 100644 paddle/fluid/platform/dynload/rocblas.h create mode 100644 paddle/fluid/platform/dynload/rocm_driver.cc create mode 100644 paddle/fluid/platform/dynload/rocm_driver.h create mode 100644 paddle/fluid/platform/dynload/rocrand.cc create mode 100644 paddle/fluid/platform/dynload/rocrand.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d0cff762e22038..1e0f7a0f095d20 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,7 +124,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) ################################ Internal Configurations ####################################### -option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) +option(WITH_ROCM_PLATFORM "Compile PaddlePaddle with ROCM platform" OFF) option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) @@ -254,10 +254,19 @@ include(configure) # add paddle env configuration include_directories("${PADDLE_SOURCE_DIR}") -if(WITH_AMD_GPU) +if(NOT DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") + set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") +else() + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") + set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been 
installed") +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +if(WITH_ROCM_PLATFORM) find_package(HIP) include(hip) -endif(WITH_AMD_GPU) +endif(WITH_ROCM_PLATFORM) if(WITH_ARM) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index fc984f5e560ef3..a31981d78d54eb 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -121,10 +121,14 @@ if(WITH_GPU) endif() include_directories(${TENSORRT_INCLUDE_DIR}) endif() -elseif(WITH_AMD_GPU) +elseif(WITH_ROCM_PLATFORM) add_definitions(-DPADDLE_WITH_HIP) + add_definitions(-DEIGEN_USE_HIP) + add_definitions(-D__HIP_PLATFORM_HCC__) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP") + set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP) else() add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index f27dcd06ef8e2e..6d1525be2c9b9f 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -28,7 +28,7 @@ endif() # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen -if(WITH_AMD_GPU) +if(WITH_ROCM_PLATFORM) set(EIGEN_REPOSITORY ${GIT_URL}/sabreshao/hipeigen.git) set(EIGEN_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e) endif() diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 353cb5c72fdfb9..117c6cde11e65d 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -39,7 +39,7 @@ ExternalProject_Add( # to be modified without triggering incremental compilation, and the # third-party library version changes cannot be incorporated. 
# reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html - UPDATE_COMMAND "" +# UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake deleted file mode 100644 index 6bcecb88e9886d..00000000000000 --- a/cmake/external/rocprim.cmake +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# rocprim is "ROCm Parallel Primitives" for short. -# It is a header-only library providing HIP and HC parallel primitives -# for developing performant GPU-accelerated code on AMD ROCm platform. 
- -if("x${HCC_HOME}" STREQUAL "x") - set(HCC_HOME "/opt/rocm/hcc") -endif() - -INCLUDE(ExternalProject) - -SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim) -SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim) -SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include) - -ExternalProject_Add( - extern_rocprim - ${SHALLOW_CLONE} - GIT_REPOSITORY "${GIT_URL}/ROCmSoftwarePlatform/rocPRIM.git" - GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc - PREFIX ${ROCPRIM_SOURCE_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc - CMAKE_ARGS -DONLY_INSTALL=ON - CMAKE_ARGS -DBUILD_TEST=OFF - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR} - - INSTALL_DIR ${ROCPRIM_INSTALL_DIR} - ${EXTERNAL_PROJECT_LOG_ARGS} -) - -INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR}) - -add_library(rocprim INTERFACE) - -add_dependencies(rocprim extern_rocprim) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index ef7d3f2f5ba9d6..bd4962908d7cda 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -155,7 +155,7 @@ set(COMMON_FLAGS ) if(NOT APPLE) - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) + if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM_PLATFORM AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 7.3)) set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-format-truncation # Warning in boost gcc 8.2 diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9d0d9e7dc442ee..0cd41043fd5e95 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -267,7 +267,7 @@ function(merge_static_libs TARGET_NAME) endfunction(merge_static_libs) function(check_coverage_opt TARGET_NAME SRCS) - if(WITH_COVERAGE AND WITH_INCREMENTAL_COVERAGE) + if(WITH_COVERAGE) if ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "") if (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" STREQUAL "")) string(REPLACE "," ";" CC_FILE_LIST $ENV{PADDLE_GIT_DIFF_CC_FILE}) @@ -532,12 +532,13 @@ function(nv_test TARGET_NAME) endfunction(nv_test) function(hip_library TARGET_NAME) - if 
(WITH_AMD_GPU) + if (WITH_ROCM_PLATFORM) set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(_sources ${hip_library_SRCS}) + set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) if(_source_files) list(REMOVE_ITEM _sources ${_source_files}) @@ -549,7 +550,7 @@ function(hip_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) - target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so) + target_link_libraries(${TARGET_NAME} ${ROCM_PATH}/hip/lib/libhip_hcc.so) find_fluid_modules(${TARGET_NAME}) endif() if("${hip_library_DEPS}" MATCHES "ARCHIVE_START") @@ -580,12 +581,59 @@ function(hip_library TARGET_NAME) endif() endfunction(hip_library) +function(hip_library_ops TARGET_NAME) + if (WITH_ROCM_PLATFORM) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_library_ops "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(_sources ${hip_library_ops_SRCS}) + HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + if(hip_library_ops_SRCS) + if (hip_library_ops_SHARED OR hip_library_ops_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources}) + 
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) + else() + add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) + target_link_libraries(${TARGET_NAME} ${ROCM_PATH}/hip/lib/libhip_hcc.so) + find_fluid_modules(${TARGET_NAME}) + endif() + if("${hip_library_ops_DEPS}" MATCHES "ARCHIVE_START") + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS). + # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries. + target_circle_link_libraries(${TARGET_NAME} ${hip_library_ops_DEPS}) + list(REMOVE_ITEM hip_library_ops_DEPS ARCHIVE_START ARCHIVE_END) + else() + target_link_libraries(${TARGET_NAME} ${hip_library_ops_DEPS}) + endif() + # cpplint code style + foreach(source_file ${hip_library_ops_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND hip_library_ops_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else(hip_library_ops_SRCS) + if (hip_library_ops_DEPS) + merge_static_libs(${TARGET_NAME} ${hip_library_ops_DEPS}) + else() + message(FATAL "Please specify source file or library in nv_library.") + endif() + endif(hip_library_ops_SRCS) + endif() +endfunction(hip_library_ops) + function(hip_binary TARGET_NAME) - if (WITH_AMD_GPU) + if (WITH_ROCM_PLATFORM) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS}) if(hip_binary_DEPS) target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS}) @@ -599,12 +647,13 @@ function(hip_binary TARGET_NAME) endfunction(hip_binary) function(hip_test TARGET_NAME) - if (WITH_AMD_GPU AND WITH_TESTING) + if 
(WITH_ROCM_PLATFORM AND WITH_TESTING) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(_sources ${hip_test_SRCS}) + set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) if(_source_files) list(REMOVE_ITEM _sources ${_source_files}) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 27ecd50e886b72..4d1a074e1abb9c 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -1,49 +1,105 @@ -if(NOT WITH_AMD_GPU) +if(NOT WITH_ROCM_PLATFORM) return() endif() -include_directories("/opt/rocm/include") -include_directories("/opt/rocm/hip/include") -include_directories("/opt/rocm/miopen/include") -include_directories("/opt/rocm/hipblas/include") -include_directories("/opt/rocm/hiprand/include") -include_directories("/opt/rocm/rocrand/include") -include_directories("/opt/rocm/rccl/include") -include_directories("/opt/rocm/thrust") +include_directories("${ROCM_PATH}/include") +include_directories("${ROCM_PATH}/hip/include") +include_directories("${ROCM_PATH}/miopen/include") +include_directories("${ROCM_PATH}/hipblas/include") +include_directories("${ROCM_PATH}/rocblas/include") +include_directories("${ROCM_PATH}/hiprand/include") +include_directories("${ROCM_PATH}/rocrand/include") +include_directories("${ROCM_PATH}/rccl/include") -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) +#include_directories("${ROCM_PATH}/thrust") +include_directories("${ROCM_PATH}/rocthrust/include/") +include_directories("${ROCM_PATH}/hipcub/include/") +include_directories("${ROCM_PATH}/rocprim/include/") +include_directories("${ROCM_PATH}/hipsparse/include/") +include_directories("${ROCM_PATH}/rocsparse/include/") 
+include_directories("${ROCM_PATH}/rocfft/include/") + +set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "") +set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "") +# now default is clang +set(HIP_COMPILER "clang") + +list(APPEND EXTERNAL_LIBS "-L${ROCM_PATH}/lib/ -lhip_hcc") +set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -DEIGEN_USE_HIP -DEIGEN_USE_GPU -D__HIP_NO_HALF_CONVERSIONS__ -std=c++11 --amdgpu-target=gfx906" ) + +if(WITH_RCCL) + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_RCCL") +endif() + +if(NOT WITH_PYTHON) + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_NO_PYTHON") +endif(NOT WITH_PYTHON) + +if(WITH_DSO) + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_USE_DSO") +endif(WITH_DSO) if(WITH_TESTING) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_TESTING") endif(WITH_TESTING) if(WITH_DISTRIBUTE) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") endif(WITH_DISTRIBUTE) if(WITH_GRPC) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_GRPC") endif(WITH_GRPC) if(WITH_MKLDNN) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_MKLDNN") endif(WITH_MKLDNN) -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") +set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) + list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) + list(APPEND HIP_HIPCC_FLAGS 
${CMAKE_CXX_FLAGS_MINSIZEREL}) endif() -if("x${HCC_HOME}" STREQUAL "x") - set(HCC_HOME "/opt/rocm/hcc") -endif() +if("${HIP_COMPILER}" STREQUAL "hcc") + if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "${ROCM_PATH}/hcc") + endif() + + set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -ldl --amdgpu-target=gfx906 ") + set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared --amdgpu-target=gfx906") + set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared --amdgpu-target=gfx906") + +elseif("${HIP_COMPILER}" STREQUAL "clang") + + if("x${HIP_CLANG_PATH}" STREQUAL "x") + set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin") + endif() -set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") -set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") -set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") + #Number of parallel jobs by default is 1 + if(NOT DEFINED HIP_CLANG_NUM_PARALLEL_JOBS) + set(HIP_CLANG_NUM_PARALLEL_JOBS 1) + endif() + #Add support for parallel build and link + if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + check_cxx_compiler_flag("-parallel-jobs=1" HIP_CLANG_SUPPORTS_PARALLEL_JOBS) + endif() + if(HIP_CLANG_NUM_PARALLEL_JOBS GREATER 1) + if(${HIP_CLANG_SUPPORTS_PARALLEL_JOBS}) + set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS} -Wno-format-nonliteral") + set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS}") + else() + message("clang compiler doesn't support parallel jobs") + endif() + endif() + + # Set the CMake Flags to use the HIP-Clang Compiler. 
+ set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o --amdgpu-target=gfx906") + set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o -shared --amdgpu-target=gfx906" ) + set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o -ldl --amdgpu-target=gfx906") +endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 715d324c357fb3..c057bddcd20c70 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -6,11 +6,13 @@ function(op_library TARGET) set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) - set(miopen_hip_cc_srcs) + set(hip_cu_cc_srcs) set(cu_cc_srcs) set(xpu_cc_srcs) set(cudnn_cu_cc_srcs) set(cudnn_cu_srcs) + set(miopen_hip_cu_cc_srcs) + set(miopen_hip_cu_srcs) set(CUDNN_FILE) set(mkldnn_cc_srcs) set(MKLDNN_FILE) @@ -49,10 +51,24 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu) list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu) endif() - if(WITH_AMD_GPU) + if(WITH_ROCM_PLATFORM) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu.cc) + list(APPEND hip_cu_cc_srcs ${TARGET}.hip.cu.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) + list(APPEND hip_cu_srcs ${TARGET}.hip.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu) + set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu + ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) + list(APPEND hip_cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu) + endif() string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc) - list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cu.cc) + list(APPEND miopen_hip_cu_cc_srcs ${MIOPEN_FILE}.hip.cu.cc) + endif() + 
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cu) + list(APPEND miopen_hip_cu_srcs ${MIOPEN_FILE}.hip.cu) endif() endif() if(WITH_MKLDNN) @@ -69,16 +85,20 @@ function(op_library TARGET) endif() else() foreach(src ${op_library_SRCS}) - if (${src} MATCHES ".*\\.hip.cu$") + if (WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu$") list(APPEND hip_cu_srcs ${src}) + elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu.cc$") + list(APPEND hip_cu_cc_srcs ${src}) elseif(${src} MATCHES ".*_cudnn_op.cu$") list(APPEND cudnn_cu_srcs ${src}) elseif (${src} MATCHES ".*\\.cu$") list(APPEND cu_srcs ${src}) elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") list(APPEND cudnn_cu_cc_srcs ${src}) - elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$") + elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*_miopen_op.hip.cc$") list(APPEND miopen_hip_cc_srcs ${src}) + elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*_miopen_op.hip.cu$") + list(APPEND miopen_hip_cu_srcs ${src}) elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") @@ -114,8 +134,8 @@ function(op_library TARGET) if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) - elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + elseif (WITH_ROCM_PLATFORM) + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} @@ -191,9 +211,21 @@ function(op_library TARGET) endif() # pybind USE_OP_DEVICE_KERNEL for MIOPEN - if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0) - file(APPEND ${pybind_file} 
"USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") + list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len) + if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0) + if(${TARGET} STREQUAL "activation") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") + else() + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") + endif() endif() + + # pybind USE_OP_DEVICE_KERNEL for MIOPEN + list(LENGTH miopen_hip_cu_srcs miopen_hip_cu_srcs_len) + if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") + endif() + if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 1eb2096af91dc9..4102949e26e2fd 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -209,11 +209,6 @@ include(external/warpctc) # download, build, install warpctc list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool) -if(WITH_AMD_GPU) - include(external/rocprim) # download, build, install rocprim - list(APPEND third_party_deps extern_rocprim) -endif() - include(cblas) # find first, then download, build, install openblas if(${CBLAS_PROVIDER} STREQUAL MKLML) list(APPEND third_party_deps extern_mklml) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 384393d9601e37..4e02b861402ac6 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -35,8 +35,8 @@ function(math_library TARGET) list(LENGTH cc_srcs cc_srcs_len) if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) - elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS 
${math_library_DEPS} ${math_common_deps}) + elseif (WITH_ROCM_PLATFORM AND (${hip_srcs} MATCHES ".*\\.hip.cu$")) + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) endif() diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 9ea218907a4cde..647bff93122b13 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,6 +1,10 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc) +#hip +if (WITH_ROCM_PLATFORM) + list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) +endif() # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux. @@ -9,6 +13,12 @@ if (NOT APPLE AND NOT WIN32) if (WITH_NCCL) list(APPEND CUDA_SRCS nccl.cc) endif() + if (WITH_ROCM_PLATFORM) + list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc) + if (WITH_RCCL) + list(APPEND HIP_SRCS rccl.cc) + endif() + endif() endif() if (TENSORRT_FOUND) @@ -19,8 +29,13 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) -nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) -cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +if(WITH_ROCM_PLATFORM) + hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) + hip_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +else() + nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +endif() if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git 
a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 03cd5814afdb52..dee7c001da8038 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -46,6 +46,24 @@ DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); +#ifdef PADDLE_WITH_HIP + +DEFINE_string(miopen_dir, "", + "Specify path for loading libcudnn.so. For instance, " + "/usr/local/cudnn/lib. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); + +DEFINE_string(rocm_dir, "", + "Specify path for loading cuda library, such as libcublas, " + "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. " + "If default, dlopen will search cuda from LD_LIBRARY_PATH"); + +DEFINE_string(rccl_dir, "", + "Specify path for loading nccl library, such as libnccl.so. " + "For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); +#endif + namespace paddle { namespace platform { namespace dynload { @@ -251,6 +269,12 @@ void* GetCublasDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetRocblasDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); +} +#endif + void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) std::string mac_warn_meg( @@ -278,6 +302,12 @@ void* GetCUDNNDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetMIOPENDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); +} +#endif + void* GetCUPTIDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.dylib", false, @@ -299,6 +329,12 @@ void* GetCurandDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetRocrandDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); +} +#endif + void* 
GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); @@ -318,6 +354,12 @@ void* GetNVRTCDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetHIPRTCDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so"); +} +#endif + void* GetCUDADsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); @@ -326,6 +368,12 @@ void* GetCUDADsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetROCMDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so"); +} +#endif + void* GetWarpCTCDsoHandle() { std::string warpctc_dir = ""; if (!s_py_site_pkg_path.path.empty()) { @@ -353,6 +401,11 @@ void* GetNCCLDsoHandle() { warning_msg); #endif } +#if defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) +void* GetRCCLDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so",true); +} +#endif void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 1136184ce1fc9a..2211f7c034c03d 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -38,6 +38,14 @@ void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); +#ifdef PADDLE_WITH_HIP +void* GetRocblasDsoHandle(); +void* GetMIOPENDsoHandle(); +void* GetHiprandDsoHandle(); +void* GetHIPRTCDsoHandle(); +void* GetROCMDsoHandle(); +void* GetRCCLDsoHandle(); +#endif void SetPaddleLibPath(const std::string&); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/hiprand.cc b/paddle/fluid/platform/dynload/hiprand.cc new file mode 100644 index 00000000000000..4fb26d0f9c85a1 --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprand.cc @@ -0,0 +1,30 @@ 
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/hiprand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag hiprand_dso_flag; +void *hiprand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hiprand.h b/paddle/fluid/platform/dynload/hiprand.h new file mode 100644 index 00000000000000..89cd4179f233c2 --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprand.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include + +#include // NOLINT +#include "paddle/fluid/platform/port.h" + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag hiprand_dso_flag; +extern void *hiprand_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + hiprandStatus_t operator()(Args... args) { \ + using hiprandFunc = decltype(&::__name); \ + std::call_once(hiprand_dso_flag, []() { \ + hiprand_dso_handle = paddle::platform::dynload::GetHiprandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define HIPRAND_RAND_ROUTINE_EACH(__macro) \ + __macro(hiprandCreateGenerator); \ + __macro(hiprandSetStream); \ + __macro(hiprandSetPseudoRandomGeneratorSeed); \ + __macro(hiprandGenerateUniform); \ + __macro(hiprandGenerateUniformDouble); \ + __macro(hiprandGenerateNormal); \ + __macro(hiprandDestroyGenerator); + +HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc new file mode 100644 index 00000000000000..0aa1f270879314 --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprtc.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/hiprtc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag hiprtc_dso_flag; +void* hiprtc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HIPRTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasHIPRTC() { + std::call_once(hiprtc_dso_flag, + []() { hiprtc_dso_handle = GetHIPRTCDsoHandle(); }); + return hiprtc_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h new file mode 100644 index 00000000000000..59724045e4bd6d --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag hiprtc_dso_flag; +extern void* hiprtc_dso_handle; +extern bool HasHIPRTC(); + +#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using hiprtc_func = decltype(&::__name); \ + std::call_once(hiprtc_dso_flag, []() { \ + hiprtc_dso_handle = paddle::platform::dynload::GetHIPRTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + + +/** + * include all needed hiprtc functions + **/ +#define HIPRTC_ROUTINE_EACH(__macro) \ + __macro(hiprtcGetErrorString); \ + __macro(hiprtcCompileProgram); \ + __macro(hiprtcCreateProgram); \ + __macro(hiprtcDestroyProgram); \ + __macro(hiprtcGetCode); \ + __macro(hiprtcGetCodeSize); \ + __macro(hiprtcGetProgramLog); \ + __macro(hiprtcGetProgramLogSize) + +HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc new file mode 100644 index 00000000000000..38041a92a9c896 --- /dev/null +++ b/paddle/fluid/platform/dynload/miopen.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag miopen_dso_flag; +void* miopen_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MIOPEN_DNN_ROUTINE_EACH(DEFINE_WRAP); +MIOPEN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); + +#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R3 +MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R4 +MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_R5 +MIOPEN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_R6 +MIOPEN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_R7 +MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R7 +MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); +#endif + + +bool HasMIOpen() { + std::call_once(miopen_dso_flag, + []() { miopen_dso_handle = GetMIOPENDsoHandle(); }); + return miopen_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE_NOT_NULL( + miopen_dso_handle, + platform::errors::PreconditionNotMet( + "Cannot load miopen shared library. Cannot invoke method %s.", fn_name)); +} + + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h new file mode 100644 index 00000000000000..792b39251de8fc --- /dev/null +++ b/paddle/fluid/platform/dynload/miopen.h @@ -0,0 +1,178 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag miopen_dso_flag; +extern void* miopen_dso_handle; +extern bool HasMIOpen(); + + +inline const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "MIOPEN_STATUS_SUCCESS"; + case miopenStatusNotInitialized: + return "MIOPEN_STATUS_NOT_INITIALIZED"; + case miopenStatusInvalidValue: + return "MIOPEN_STATUS_INVALID_VALUE"; + case miopenStatusBadParm: + return "MIOPEN_STATUS_BAD_PARAM"; + case miopenStatusAllocFailed: + return "MIOPEN_STATUS_ALLOC_FAILED"; + case miopenStatusInternalError: + return "MIOPEN_STATUS_INTERNAL_ERROR"; + case miopenStatusNotImplemented: + return "MIOPEN_STATUS_NOT_IMPLEMENTED"; + case miopenStatusUnknownError: + default: + return "MIOPEN_STATUS_UNKNOWN_ERROR"; + } +} + +extern void EnforceCUDNNLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using miopen_func = decltype(&::__name); \ + std::call_once(miopen_dso_flag, []() { \ + miopen_dso_handle = paddle::platform::dynload::GetMIOPENDsoHandle(); \ + }); \ + EnforceCUDNNLoaded(#__name); \ + static void* p_##__name = dlsym(miopen_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + + +/** + * include all needed miopen functions in HPPL + **/ +#define MIOPEN_DNN_ROUTINE_EACH(__macro) \ + __macro(miopenSet4dTensorDescriptor); \ + __macro(miopenSetTensorDescriptor); \ + __macro(miopenInitConvolutionNdDescriptor); \ + __macro(miopenFindConvolutionForwardAlgorithm); \ + __macro(miopenGetConvolutionNdForwardOutputDim); \ + __macro(miopenFindConvolutionBackwardDataAlgorithm); \ + __macro(miopenFindConvolutionBackwardWeightsAlgorithm); \ + __macro(miopenGetTensorDescriptor); \ + __macro(miopenCreateTensorDescriptor); \ + __macro(miopenDestroyTensorDescriptor); \ + __macro(miopenSet2dPoolingDescriptor); \ + __macro(miopenGet2dPoolingDescriptor); \ + __macro(miopenGetPoolingNdForwardOutputDim); \ + __macro(miopenCreateConvolutionDescriptor); \ + __macro(miopenCreatePoolingDescriptor); \ + __macro(miopenDestroyPoolingDescriptor); \ + __macro(miopenPoolingGetWorkSpaceSize); \ + __macro(miopenPoolingGetWorkSpaceSizeV2); \ + __macro(miopenSetNdPoolingDescriptor); \ + __macro(miopenInitConvolutionDescriptor); \ + __macro(miopenDestroyConvolutionDescriptor); \ + __macro(miopenGetConvolutionNdDescriptor); \ + __macro(miopenDeriveBNTensorDescriptor); \ + __macro(miopenCreate); \ + __macro(miopenDestroy); \ + __macro(miopenSetStream); \ + __macro(miopenActivationForward); \ + __macro(miopenActivationBackward); \ + __macro(miopenConvolutionBackwardWeights); \ + __macro(miopenConvolutionForward); \ + __macro(miopenConvolutionBackwardBias); \ + __macro(miopenConvolutionForwardGetWorkSpaceSize); \ + __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \ + __macro(miopenTransformTensor); \ + 
__macro(miopenPoolingForward); \ + __macro(miopenPoolingBackward); \ + __macro(miopenSoftmaxBackward); \ + __macro(miopenSoftmaxForward); \ + __macro(miopenCreateDropoutDescriptor); \ + __macro(miopenDropoutGetStatesSize); \ + __macro(miopenSetDropoutDescriptor); \ + __macro(miopenCreateRNNDescriptor); \ + __macro(miopenSetRNNDescriptor); \ + __macro(miopenGetRNNParamsSize); \ + __macro(miopenGetRNNWorkspaceSize); \ + __macro(miopenGetRNNTrainingReserveSize); \ + __macro(miopenRNNForwardTraining); \ + __macro(miopenRNNBackwardData); \ + __macro(miopenRNNBackwardWeights); \ + __macro(miopenRNNForwardInference); \ + __macro(miopenDestroyDropoutDescriptor); \ + __macro(miopenDestroyRNNDescriptor); \ + +MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(miopenConvolutionBackwardData); +MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R3: +#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize); +MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R4: +#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(miopenBatchNormalizationForwardTraining); \ + __macro(miopenBatchNormalizationForwardInference); \ + __macro(miopenBatchNormalizationBackward); +MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs in R5 +#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(miopenCreateActivationDescriptor); \ + __macro(miopenSetActivationDescriptor); \ + __macro(miopenGetActivationDescriptor); \ + __macro(miopenDestroyActivationDescriptor); +MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs in R6 +#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \ + /*__macro(miopenSetRNNDescriptor_v6);*/ +MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(miopenSetConvolutionGroupCount); 
\ + __macro(miopenCreateCTCLossDescriptor); \ + __macro(miopenDestroyCTCLossDescriptor); \ + __macro(miopenGetCTCLossDescriptor); \ + __macro(miopenSetCTCLossDescriptor); \ + __macro(miopenGetCTCLossWorkspaceSize); \ + __macro(miopenCTCLoss); +MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ + /*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ + __macro(cudnnBatchNormalizationForwardTrainingEx); \ + __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ + __macro(cudnnBatchNormalizationBackwardEx); \ + __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/ +MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rccl.cc b/paddle/fluid/platform/dynload/rccl.cc new file mode 100644 index 00000000000000..a3043ead8329ae --- /dev/null +++ b/paddle/fluid/platform/dynload/rccl.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/rccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag rccl_dso_flag; +void *rccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h new file mode 100644 index 00000000000000..ac8d111e8d26dd --- /dev/null +++ b/paddle/fluid/platform/dynload/rccl.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag rccl_dso_flag; +extern void* rccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(&::__name); \ + std::call_once(rccl_dso_flag, []() { \ + rccl_dso_handle = paddle::platform::dynload::GetRCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(rccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define RCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(ncclCommInitAll); \ + __macro(ncclGetUniqueId); \ + __macro(ncclCommInitRank); \ + __macro(ncclCommDestroy); \ + __macro(ncclCommCount); \ + __macro(ncclCommCuDevice); \ + __macro(ncclCommUserRank); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ + __macro(ncclReduce); \ + __macro(ncclReduceScatter); \ + __macro(ncclGetErrorString); + +RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocblas.cc b/paddle/fluid/platform/dynload/rocblas.cc new file mode 100644 index 00000000000000..ee774195363216 --- /dev/null +++ b/paddle/fluid/platform/dynload/rocblas.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/rocblas.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag rocblas_dso_flag; +void *rocblas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +ROCBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R2 +ROCBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + +#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R3 +ROCBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + +#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R4 +ROCBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h new file mode 100644 index 00000000000000..858c3ae4d59d3e --- /dev/null +++ b/paddle/fluid/platform/dynload/rocblas.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag rocblas_dso_flag; +extern void *rocblas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. 
+ * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using rocblas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(rocblas_dso_flag, []() { \ + rocblas_dso_handle = paddle::platform::dynload::GetRocblasDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(rocblas_saxpy); \ + __macro(rocblas_daxpy); \ + __macro(rocblas_sscal); \ + __macro(rocblas_dscal); \ + __macro(rocblas_scopy); \ + __macro(rocblas_dcopy); \ + __macro(rocblas_sgemv); \ + __macro(rocblas_dgemv); \ + __macro(rocblas_sgemm); \ + __macro(rocblas_dgemm); \ + __macro(rocblas_hgemm); \ + __macro(rocblas_dgeam); \ + /*__macro(rocblas_gemm_ex); */\ + __macro(rocblas_sgemm_batched); \ + __macro(rocblas_dgemm_batched); \ + __macro(rocblas_cgemm_batched); \ + __macro(rocblas_zgemm_batched); \ + __macro(rocblas_create_handle); \ + __macro(rocblas_destroy_handle); \ + __macro(rocblas_add_stream); \ + __macro(rocblas_set_stream); \ + __macro(rocblas_get_stream); \ + __macro(rocblas_set_pointer_mode); \ + __macro(rocblas_get_pointer_mode); + +ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + +#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ + __macro(rocblas_sgemm_strided_batched); \ + __macro(rocblas_dgemm_strided_batched); \ + __macro(rocblas_cgemm_strided_batched); \ + __macro(rocblas_zgemm_strided_batched); \ + __macro(rocblas_hgemm_strided_batched); + +ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + +#define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ + +ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + +#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + 
__macro(rocblas_gemm_batched_ex); \ +// __macro(rocblas_gemm_strided_batched_ex); + +ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + + +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc new file mode 100644 index 00000000000000..c26e7f57ecaa2d --- /dev/null +++ b/paddle/fluid/platform/dynload/rocm_driver.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/rocm_driver.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag rocm_dso_flag; +void* rocm_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +ROCM_ROUTINE_EACH(DEFINE_WRAP); + +bool HasROCMDriver() { + std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetROCMDsoHandle(); }); + return rocm_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h new file mode 100644 index 00000000000000..7ab870b15f0157 --- /dev/null +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag rocm_dso_flag; +extern void* rocm_dso_handle; +extern bool HasROCMDriver(); + +#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using rocm_func = decltype(&::__name); \ + std::call_once(rocm_dso_flag, []() { \ + rocm_dso_handle = paddle::platform::dynload::GetROCMDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(rocm_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed cuda driver functions + **/ +#define ROCM_ROUTINE_EACH(__macro) \ + __macro(hipGetErrorString); \ + __macro(hipModuleLoadData); \ + __macro(hipModuleGetFunction); \ + __macro(hipModuleUnload); \ + /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \ + __macro(hipModuleLaunchKernel); \ + __macro(hipLaunchKernel); \ + __macro(hipGetDevice); \ + __macro(hipDevicePrimaryCtxGetState) + +ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocrand.cc b/paddle/fluid/platform/dynload/rocrand.cc new file mode 100644 index 00000000000000..8e14f88ff9c182 --- /dev/null +++ b/paddle/fluid/platform/dynload/rocrand.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/rocrand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocrand.h b/paddle/fluid/platform/dynload/rocrand.h new file mode 100644 index 00000000000000..ebb53ede84bc8e --- /dev/null +++ b/paddle/fluid/platform/dynload/rocrand.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT +#include "paddle/fluid/platform/port.h" + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag curand_dso_flag; +extern void *curand_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + rocrand_status operator()(Args... 
args) { \ + using curandFunc = decltype(&::__name); \ + std::call_once(curand_dso_flag, []() { \ + curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(rocrand_create_generator); \ + __macro(rocrand_set_stream); \ + __macro(rocrand_generate_uniform); \ + __macro(rocrand_generate_uniform_double); \ + __macro(rocrand_generate_normal); \ + __macro(rocrand_destroy_generator); + +CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index b70a206b7dee62..72c50619e10d16 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -20,6 +20,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_HIP +#define CUDA_VERSION 10000 +#include +#endif #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) @@ -37,6 +41,10 @@ limitations under the License. 
*/ #define PADDLE_CUDA_FP16 #include #endif +#ifdef __HIPCC__ +#define PADDLE_CUDA_FP16 +#include +#endif #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) @@ -81,11 +89,13 @@ struct PADDLE_ALIGN(2) float16 { // Constructors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else x = h.x; #endif // CUDA_VERSION >= 9000 +#endif } #endif // PADDLE_CUDA_FP16 @@ -100,7 +110,7 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit float16(float val) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__)))) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -246,7 +256,7 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit operator float() const { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__)))) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -353,10 +363,10 @@ struct PADDLE_ALIGN(2) float16 { // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in // CUDA 9.0 regarding the half data type. 
+//xuan[TODO] change for rocm #if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 - DEVICE inline half operator+(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hadd(a, b); #else float res = static_cast(float16(a)) + static_cast(float16(b)); @@ -365,7 +375,7 @@ DEVICE inline half operator+(const half& a, const half& b) { } DEVICE inline half operator-(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hsub(a, b); #else float res = static_cast(float16(a)) - static_cast(float16(b)); @@ -374,7 +384,7 @@ DEVICE inline half operator-(const half& a, const half& b) { } DEVICE inline half operator*(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hmul(a, b); #else float res = static_cast(float16(a)) * static_cast(float16(b)); @@ -383,7 +393,7 @@ DEVICE inline half operator*(const half& a, const half& b) { } DEVICE inline half operator/(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); @@ -394,7 +404,7 @@ DEVICE inline half operator/(const half& a, const half& b) { } DEVICE inline half operator-(const half& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hneg(a); #else float res = -static_cast(float16(a)); @@ -423,7 +433,7 @@ DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT } DEVICE inline bool operator==(const half& a, const 
half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __heq(a, b); #else return static_cast(float16(a)) == static_cast(float16(b)); @@ -431,7 +441,7 @@ DEVICE inline bool operator==(const half& a, const half& b) { } DEVICE inline bool operator!=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hne(a, b); #else return static_cast(float16(a)) != static_cast(float16(b)); @@ -439,7 +449,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) { } DEVICE inline bool operator<(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hlt(a, b); #else return static_cast(float16(a)) < static_cast(float16(b)); @@ -447,7 +457,7 @@ DEVICE inline bool operator<(const half& a, const half& b) { } DEVICE inline bool operator<=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hle(a, b); #else return static_cast(float16(a)) <= static_cast(float16(b)); @@ -455,7 +465,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) { } DEVICE inline bool operator>(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hgt(a, b); #else return static_cast(float16(a)) > static_cast(float16(b)); @@ -463,7 +473,7 @@ DEVICE inline bool operator>(const half& a, const half& b) { } DEVICE inline bool operator>=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || 
(defined(__HIP_DEVICE_COMPILE__))) return __hge(a, b); #else return static_cast(float16(a)) >= static_cast(float16(b)); @@ -475,7 +485,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) { // Arithmetic operators for float16 on GPU #if defined(PADDLE_CUDA_FP16) HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return float16(__hadd(half(a), half(b))); #else return float16(static_cast(a) + static_cast(b)); @@ -483,7 +493,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return float16(__hsub(half(a), half(b))); #else return float16(static_cast(a) - static_cast(b)); @@ -491,7 +501,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return float16(__hmul(half(a), half(b))); #else return float16(static_cast(a) * static_cast(b)); @@ -499,7 +509,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__))) // TODO(kexinzhao): check which cuda version starts to support __hdiv float num = __half2float(half(a)); float denom = __half2float(half(b)); @@ -510,7 +520,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { } HOSTDEVICE inline float16 
operator-(const float16& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return float16(__hneg(half(a))); #else float16 res; @@ -540,7 +550,7 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT } HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __heq(half(a), half(b)); #else return static_cast(a) == static_cast(b); @@ -548,7 +558,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hne(half(a), half(b)); #else return static_cast(a) != static_cast(b); @@ -556,7 +566,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hlt(half(a), half(b)); #else return static_cast(a) < static_cast(b); @@ -564,7 +574,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hle(half(a), half(b)); #else return static_cast(a) <= static_cast(b); @@ -572,7 +582,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && 
__CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hgt(half(a), half(b)); #else return static_cast(a) > static_cast(b); @@ -580,7 +590,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hge(half(a), half(b)); #else return static_cast(a) >= static_cast(b); @@ -846,7 +856,7 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { } HOSTDEVICE inline bool(isnan)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__)))) return __hisnan(half(a)); #else return (a.x & 0x7fff) > 0x7c00; @@ -936,31 +946,31 @@ struct numeric_limits { static const bool traps = true; static const bool tinyness_before = false; - static paddle::platform::float16(min)() { + HOSTDEVICE static paddle::platform::float16(min)() { return paddle::platform::raw_uint16_to_float16(0x400); } - static paddle::platform::float16 lowest() { + HOSTDEVICE static paddle::platform::float16 lowest() { return paddle::platform::raw_uint16_to_float16(0xfbff); } - static paddle::platform::float16(max)() { + HOSTDEVICE static paddle::platform::float16(max)() { return paddle::platform::raw_uint16_to_float16(0x7bff); } - static paddle::platform::float16 epsilon() { + HOSTDEVICE static paddle::platform::float16 epsilon() { return paddle::platform::raw_uint16_to_float16(0x0800); } - static paddle::platform::float16 round_error() { + HOSTDEVICE static paddle::platform::float16 round_error() { return paddle::platform::float16(0.5); } - static paddle::platform::float16 infinity() { + HOSTDEVICE static paddle::platform::float16 
infinity() { return paddle::platform::raw_uint16_to_float16(0x7c00); } - static paddle::platform::float16 quiet_NaN() { + HOSTDEVICE static paddle::platform::float16 quiet_NaN() { return paddle::platform::raw_uint16_to_float16(0x7e00); } - static paddle::platform::float16 signaling_NaN() { + HOSTDEVICE static paddle::platform::float16 signaling_NaN() { return paddle::platform::raw_uint16_to_float16(0x7e00); } - static paddle::platform::float16 denorm_min() { + HOSTDEVICE static paddle::platform::float16 denorm_min() { return paddle::platform::raw_uint16_to_float16(0x1); } }; diff --git a/paddle/fluid/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h index c0dc92a5217640..1ffbbc217e254c 100644 --- a/paddle/fluid/platform/hostdevice.h +++ b/paddle/fluid/platform/hostdevice.h @@ -13,7 +13,11 @@ // limitations under the License. #pragma once -#ifdef __CUDACC__ +#ifdef __HIPCC__ +#include +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index c25b692a4a0c70..bc1ab96528cc73 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -145,11 +145,11 @@ if(WITH_PYTHON) endif(WITH_MKLDNN) endif(WIN32) - if(WITH_AMD_GPU) - hip_library(paddle_pybind SHARED + if(WITH_ROCM_PLATFORM) + cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ARCHIVE_START ${PYBIND_DEPS} - ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END) + DEPS ${PYBIND_DEPS} + ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) else() cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} @@ -158,7 +158,7 @@ if(WITH_PYTHON) if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) endif(NOT APPLE AND NOT WIN32) - endif(WITH_AMD_GPU) + endif(WITH_ROCM_PLATFORM) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) 
From c7f4a294059ace125404a42b2162167212f62e73 Mon Sep 17 00:00:00 2001 From: XUANBABY Date: Thu, 3 Dec 2020 17:57:13 +0800 Subject: [PATCH 02/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/external/pybind11.cmake | 1 - cmake/generic.cmake | 2 +- cmake/hip.cmake | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 117c6cde11e65d..69bd68c2778497 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -39,7 +39,6 @@ ExternalProject_Add( # to be modified without triggering incremental compilation, and the # third-party library version changes cannot be incorporated. # reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html -# UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 0cd41043fd5e95..be7adf4dceaeaa 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -267,7 +267,7 @@ function(merge_static_libs TARGET_NAME) endfunction(merge_static_libs) function(check_coverage_opt TARGET_NAME SRCS) - if(WITH_COVERAGE) + if(WITH_COVERAGE AND WITH_INCREMENTAL_COVERAGE) if ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "") if (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" STREQUAL "")) string(REPLACE "," ";" CC_FILE_LIST $ENV{PADDLE_GIT_DIFF_CC_FILE}) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 4d1a074e1abb9c..ac666ec686d163 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -11,7 +11,6 @@ include_directories("${ROCM_PATH}/hiprand/include") include_directories("${ROCM_PATH}/rocrand/include") include_directories("${ROCM_PATH}/rccl/include") -#include_directories("${ROCM_PATH}/thrust") include_directories("${ROCM_PATH}/rocthrust/include/") include_directories("${ROCM_PATH}/hipcub/include/") 
include_directories("${ROCM_PATH}/rocprim/include/") From 9ecc54cf7d16c64e64dba5635d795e250cc78774 Mon Sep 17 00:00:00 2001 From: XUANBABY Date: Fri, 4 Dec 2020 09:59:43 +0800 Subject: [PATCH 03/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E6=AD=A7=E4=B9=89=E5=B9=B6=E6=B7=BB=E5=8A=A0=E5=A4=87=E6=B3=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fluid/platform/dynload/dynamic_loader.cc | 18 +++++++++--------- paddle/fluid/platform/dynload/rocblas.h | 2 ++ paddle/fluid/platform/dynload/rocm_driver.h | 1 + 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index dee7c001da8038..137e2ab0f0c2ea 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -49,19 +49,19 @@ DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); #ifdef PADDLE_WITH_HIP DEFINE_string(miopen_dir, "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); + "Specify path for loading libMIOpen.so. For instance, " + "/opt/rocm/miopen/lib. If empty [default], dlopen " + "will search miopen from LD_LIBRARY_PATH"); DEFINE_string(rocm_dir, "", - "Specify path for loading cuda library, such as libcublas, " - "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. " - "If default, dlopen will search cuda from LD_LIBRARY_PATH"); + "Specify path for loading rocm library, such as librocblas, " + "libcurand, libcusolver. For instance, /opt/rocm/lib. " + "If default, dlopen will search rocm from LD_LIBRARY_PATH"); DEFINE_string(rccl_dir, "", - "Specify path for loading nccl library, such as libnccl.so. " - "For instance, /usr/local/cuda/lib64. 
If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); + "Specify path for loading rccl library, such as librccl.so. " + "For instance, /opt/rocm/rccl/lib. If default, " + "dlopen will search rccl from LD_LIBRARY_PATH"); #endif namespace paddle { diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h index 858c3ae4d59d3e..c705396e90db96 100644 --- a/paddle/fluid/platform/dynload/rocblas.h +++ b/paddle/fluid/platform/dynload/rocblas.h @@ -64,6 +64,7 @@ extern void *rocblas_dso_handle; __macro(rocblas_dgemm); \ __macro(rocblas_hgemm); \ __macro(rocblas_dgeam); \ + /*rocblas_gemm_ex function not support at rocm3.5*/ \ /*__macro(rocblas_gemm_ex); */\ __macro(rocblas_sgemm_batched); \ __macro(rocblas_dgemm_batched); \ @@ -94,6 +95,7 @@ ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ __macro(rocblas_gemm_batched_ex); \ +//rocm not support now(rocm3.5) // __macro(rocblas_gemm_strided_batched_ex); ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 7ab870b15f0157..37e1c2f0d61793 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -50,6 +50,7 @@ extern bool HasROCMDriver(); __macro(hipModuleLoadData); \ __macro(hipModuleGetFunction); \ __macro(hipModuleUnload); \ + /*rocm3.5 not support the function*/ \ /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \ __macro(hipModuleLaunchKernel); \ __macro(hipLaunchKernel); \ From b3e4c16286e99093f41eb19d2d35be0984f427c2 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Mon, 7 Dec 2020 12:35:58 +0800 Subject: [PATCH 04/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/platform/float16.h | 80 
++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 72c50619e10d16..753f0d398c2044 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -110,7 +110,9 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit float16(float val) { -#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__)))) +#if ((defined(PADDLE_CUDA_FP16)) && \ + ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \ + (defined(__HIP_DEVICE_COMPILE__)))) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -256,7 +258,9 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit operator float() const { -#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__)))) +#if (defined(PADDLE_CUDA_FP16) && \ + ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \ + (defined(__HIP_DEVICE_COMPILE__)))) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -363,10 +367,11 @@ struct PADDLE_ALIGN(2) float16 { // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in // CUDA 9.0 regarding the half data type. 
-//xuan[TODO] change for rocm +// xuan[TODO] change for rocm #if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 DEVICE inline half operator+(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hadd(a, b); #else float res = static_cast(float16(a)) + static_cast(float16(b)); @@ -375,7 +380,8 @@ DEVICE inline half operator+(const half& a, const half& b) { } DEVICE inline half operator-(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hsub(a, b); #else float res = static_cast(float16(a)) - static_cast(float16(b)); @@ -384,7 +390,8 @@ DEVICE inline half operator-(const half& a, const half& b) { } DEVICE inline half operator*(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hmul(a, b); #else float res = static_cast(float16(a)) * static_cast(float16(b)); @@ -393,7 +400,8 @@ DEVICE inline half operator*(const half& a, const half& b) { } DEVICE inline half operator/(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); @@ -404,7 +412,8 @@ DEVICE inline half operator/(const half& a, const half& b) { } DEVICE inline half operator-(const half& a) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ 
>= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hneg(a); #else float res = -static_cast(float16(a)); @@ -433,7 +442,8 @@ DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT } DEVICE inline bool operator==(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __heq(a, b); #else return static_cast(float16(a)) == static_cast(float16(b)); @@ -441,7 +451,8 @@ DEVICE inline bool operator==(const half& a, const half& b) { } DEVICE inline bool operator!=(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hne(a, b); #else return static_cast(float16(a)) != static_cast(float16(b)); @@ -449,7 +460,8 @@ DEVICE inline bool operator!=(const half& a, const half& b) { } DEVICE inline bool operator<(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hlt(a, b); #else return static_cast(float16(a)) < static_cast(float16(b)); @@ -457,7 +469,8 @@ DEVICE inline bool operator<(const half& a, const half& b) { } DEVICE inline bool operator<=(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hle(a, b); #else return static_cast(float16(a)) <= static_cast(float16(b)); @@ -465,7 +478,8 @@ DEVICE inline bool operator<=(const half& a, const half& b) { } DEVICE inline bool operator>(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) 
|| (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hgt(a, b); #else return static_cast(float16(a)) > static_cast(float16(b)); @@ -473,7 +487,8 @@ DEVICE inline bool operator>(const half& a, const half& b) { } DEVICE inline bool operator>=(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hge(a, b); #else return static_cast(float16(a)) >= static_cast(float16(b)); @@ -485,7 +500,8 @@ DEVICE inline bool operator>=(const half& a, const half& b) { // Arithmetic operators for float16 on GPU #if defined(PADDLE_CUDA_FP16) HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return float16(__hadd(half(a), half(b))); #else return float16(static_cast(a) + static_cast(b)); @@ -493,7 +509,8 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return float16(__hsub(half(a), half(b))); #else return float16(static_cast(a) - static_cast(b)); @@ -501,7 +518,8 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return 
float16(__hmul(half(a), half(b))); #else return float16(static_cast(a) * static_cast(b)); @@ -509,7 +527,8 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \ + (defined(__HIP_DEVICE_COMPILE__))) // TODO(kexinzhao): check which cuda version starts to support __hdiv float num = __half2float(half(a)); float denom = __half2float(half(b)); @@ -520,7 +539,8 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator-(const float16& a) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return float16(__hneg(half(a))); #else float16 res; @@ -550,7 +570,8 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT } HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __heq(half(a), half(b)); #else return static_cast(a) == static_cast(b); @@ -558,7 +579,8 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hne(half(a), half(b)); #else return static_cast(a) != static_cast(b); @@ -566,7 +588,8 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator<(const 
float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hlt(half(a), half(b)); #else return static_cast(a) < static_cast(b); @@ -574,7 +597,8 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hle(half(a), half(b)); #else return static_cast(a) <= static_cast(b); @@ -582,7 +606,8 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hgt(half(a), half(b)); #else return static_cast(a) > static_cast(b); @@ -590,7 +615,8 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hge(half(a), half(b)); #else return static_cast(a) >= static_cast(b); @@ -856,7 +882,9 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { } HOSTDEVICE inline bool(isnan)(const float16& a) { -#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__)))) +#if (defined(PADDLE_CUDA_FP16) && \ + ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__)))) return 
__hisnan(half(a)); #else return (a.x & 0x7fff) > 0x7c00; From ce2b25304592b7188de71e33412680fc3cbf21f1 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Mon, 7 Dec 2020 19:20:48 +0800 Subject: [PATCH 05/10] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=86=B2=E7=AA=81?= =?UTF-8?q?=E5=90=8E=E7=9A=84=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/operators.cmake | 87 +++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 764d96e189cbd1..0d9f6c648b3b75 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -1,10 +1,10 @@ -# CMake file `unity_build` is used to handle Unity Build compilation. +#CMake file `unity_build` is used to handle Unity Build compilation. include(unity_build) set(PART_CUDA_KERNEL_FILES) function(op_library TARGET) - # op_library is a function to create op library. The interface is same as - # cc_library. But it handle split GPU/CPU code and link some common library - # for ops. +#op_library is a function to create op library.The interface is same as +#cc_library.But it handle split GPU / CPU code and link some common library +#for ops. set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) @@ -19,7 +19,8 @@ function(op_library TARGET) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) - # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. +#Option `UNITY` is used to specify that operator `TARGET` will compiles with \ + Unity Build. set(options UNITY) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) @@ -121,7 +122,8 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() if (WIN32) - # remove windows unsupported op, because windows has no nccl, no warpctc such ops. 
+#remove windows unsupported op, because windows has no nccl, \ + no warpctc such ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() @@ -129,9 +131,10 @@ function(op_library TARGET) endforeach() endif(WIN32) - # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. +#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ + option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) - # Generate the unity target name by the directory where source files located. +#Generate the unity target name by the directory where source files located. string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR}) string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") @@ -147,40 +150,42 @@ function(op_library TARGET) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) endif() if (WITH_GPU) - # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. +#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ + option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) - # Combine the cc and cu source files. +#Combine the cc and cu source files. compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs}) compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs} ${cu_srcs}) if(TARGET ${UNITY_TARGET}) - # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. +#If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources} ${unity_target_cu_sources}) else() - # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. +#If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. 
nv_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} ${unity_target_cu_sources} DEPS ${op_library_DEPS} ${op_common_deps}) endif() - # Add alias library to handle dependencies. +#Add alias library to handle dependencies. add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() - elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + elseif (WITH_ROCM_PLATFORM) + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() - # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. +#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ + option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) - # Combine the cc source files. +#Combine the cc source files. compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}) if(TARGET ${UNITY_TARGET}) - # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. +#If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) else() - # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. +#If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. cc_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} DEPS ${op_library_DEPS} ${op_common_deps}) endif() - # Add alias library to handle dependencies. +#Add alias library to handle dependencies. 
add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} @@ -188,7 +193,7 @@ function(op_library TARGET) endif() endif() - # Define operators that don't need pybind here. +#Define operators that don't need pybind here. foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" @@ -200,12 +205,14 @@ function(op_library TARGET) endif() endforeach() - # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. - # Note that it's enough to just adding one operator to pybind in a *_op.cc file. - # And for detail pybind information, please see generated paddle/pybind/pybind.h. +#The registration of USE_OP, \ + please refer to paddle / fluid / framework / op_registry.h. +#Note that it's enough to just adding one operator to pybind in a *_op.cc file. +#And for detail pybind information, \ + please see generated paddle / pybind / pybind.h. file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") - # [ \t\r\n]* is used for blank characters +#[ \t\r\n] * is used for blank characters string(REGEX MATCH "REGISTER_OPERATOR\\([ \t\r\n]*[a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") @@ -213,13 +220,13 @@ function(op_library TARGET) else () string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") string(REPLACE "," "" TARGET "${TARGET}") - # [ \t\r\n]+ is used for blank characters. - # Here we use '+' instead of '*' since it is a REPLACE operation. +#[ \t\r\n] + is used for blank characters. +#Here we use '+' instead of '*' since it is a REPLACE operation. 
string(REGEX REPLACE "[ \t\r\n]+" "" TARGET "${TARGET}") endif() - # pybind USE_NO_KERNEL_OP - # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel +#pybind USE_NO_KERNEL_OP +#HACK : if REGISTER_OP_CPU_KERNEL presents the operator must have kernel string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") string(REPLACE "_op" "" TARGET "${TARGET}") if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") @@ -227,7 +234,7 @@ function(op_library TARGET) set(pybind_flag 1) endif() - # pybind USE_CPU_ONLY_OP +#pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) @@ -240,7 +247,7 @@ function(op_library TARGET) set(pybind_flag 1) endif() - # pybind USE_OP_DEVICE_KERNEL for CUDNN +#pybind USE_OP_DEVICE_KERNEL for CUDNN list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) if(${TARGET} STREQUAL "activation") @@ -250,13 +257,13 @@ function(op_library TARGET) endif() endif() - # pybind USE_OP_DEVICE_KERNEL for CUDNN +#pybind USE_OP_DEVICE_KERNEL for CUDNN list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) if (WITH_GPU AND ${cudnn_cu_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() - # pybind USE_OP_DEVICE_KERNEL for MIOPEN +#pybind USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0) if(${TARGET} STREQUAL "activation") @@ -265,8 +272,8 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() endif() - - # pybind USE_OP_DEVICE_KERNEL for MIOPEN + +#pybind USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_srcs miopen_hip_cu_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") @@ -276,9 +283,9 @@ 
function(op_library TARGET) if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() - # pybind USE_OP_DEVICE_KERNEL for MKLDNN +#pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) - # Append first implemented MKLDNN activation operator +#Append first implemented MKLDNN activation operator if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") @@ -298,9 +305,9 @@ function(op_library TARGET) endif() endif() - # pybind USE_OP +#pybind USE_OP if (${pybind_flag} EQUAL 0) - # NOTE(*): activation use macro to regist the kernels, set use_op manually. +#NOTE(*) : activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") @@ -341,7 +348,7 @@ function(register_operators) endif() endforeach() - # Complete the processing of `UNITY_TARGET`. +#Complete the processing of `UNITY_TARGET`. if(WITH_UNITY_BUILD) finish_unity_target(cc) if(WITH_GPU) From 7add2748e22953aab02ef3cceeb098ab088f4309 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Tue, 8 Dec 2020 09:52:27 +0800 Subject: [PATCH 06/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9operators.cmake?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/operators.cmake | 93 ++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 55 deletions(-) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 0d9f6c648b3b75..92c99a2df5b5f8 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -1,26 +1,23 @@ -#CMake file `unity_build` is used to handle Unity Build compilation. +# CMake file `unity_build` is used to handle Unity Build compilation. 
include(unity_build) set(PART_CUDA_KERNEL_FILES) function(op_library TARGET) -#op_library is a function to create op library.The interface is same as -#cc_library.But it handle split GPU / CPU code and link some common library -#for ops. + # op_library is a function to create op library. The interface is same as + # cc_library. But it handle split GPU/CPU code and link some common library + # for ops. set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) - set(hip_cu_cc_srcs) + set(miopen_hip_cc_srcs) set(cu_cc_srcs) set(xpu_cc_srcs) set(cudnn_cu_cc_srcs) set(cudnn_cu_srcs) - set(miopen_hip_cu_cc_srcs) - set(miopen_hip_cu_srcs) set(CUDNN_FILE) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) -#Option `UNITY` is used to specify that operator `TARGET` will compiles with \ - Unity Build. + # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. set(options UNITY) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) @@ -122,8 +119,7 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() if (WIN32) -#remove windows unsupported op, because windows has no nccl, \ - no warpctc such ops. + # remove windows unsupported op, because windows has no nccl, no warpctc such ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() @@ -131,10 +127,9 @@ function(op_library TARGET) endforeach() endif(WIN32) -#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ - option `UNITY`. + # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) -#Generate the unity target name by the directory where source files located. + # Generate the unity target name by the directory where source files located. 
string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR}) string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") @@ -150,42 +145,40 @@ function(op_library TARGET) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) endif() if (WITH_GPU) -#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ - option `UNITY`. + # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) -#Combine the cc and cu source files. + # Combine the cc and cu source files. compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs}) compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs} ${cu_srcs}) if(TARGET ${UNITY_TARGET}) -#If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. + # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources} ${unity_target_cu_sources}) else() -#If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. + # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. nv_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} ${unity_target_cu_sources} DEPS ${op_library_DEPS} ${op_common_deps}) endif() -#Add alias library to handle dependencies. + # Add alias library to handle dependencies. 
add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() elseif (WITH_ROCM_PLATFORM) - hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS {op_library_DEPS} ${op_common_deps}) else() -#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ - option `UNITY`. + # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) -#Combine the cc source files. + # Combine the cc source files. compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}) if(TARGET ${UNITY_TARGET}) -#If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. + # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) else() -#If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. + # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. cc_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} DEPS ${op_library_DEPS} ${op_common_deps}) endif() -#Add alias library to handle dependencies. + # Add alias library to handle dependencies. add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} @@ -193,7 +186,7 @@ function(op_library TARGET) endif() endif() -#Define operators that don't need pybind here. + # Define operators that don't need pybind here. 
foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" @@ -205,14 +198,12 @@ function(op_library TARGET) endif() endforeach() -#The registration of USE_OP, \ - please refer to paddle / fluid / framework / op_registry.h. -#Note that it's enough to just adding one operator to pybind in a *_op.cc file. -#And for detail pybind information, \ - please see generated paddle / pybind / pybind.h. + # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") -#[ \t\r\n] * is used for blank characters + # [ \t\r\n]* is used for blank characters string(REGEX MATCH "REGISTER_OPERATOR\\([ \t\r\n]*[a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") @@ -220,13 +211,13 @@ function(op_library TARGET) else () string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") string(REPLACE "," "" TARGET "${TARGET}") -#[ \t\r\n] + is used for blank characters. -#Here we use '+' instead of '*' since it is a REPLACE operation. + # [ \t\r\n]+ is used for blank characters. + # Here we use '+' instead of '*' since it is a REPLACE operation. 
string(REGEX REPLACE "[ \t\r\n]+" "" TARGET "${TARGET}") endif() -#pybind USE_NO_KERNEL_OP -#HACK : if REGISTER_OP_CPU_KERNEL presents the operator must have kernel + # pybind USE_NO_KERNEL_OP + # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") string(REPLACE "_op" "" TARGET "${TARGET}") if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") @@ -234,7 +225,7 @@ function(op_library TARGET) set(pybind_flag 1) endif() -#pybind USE_CPU_ONLY_OP + # pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) @@ -247,7 +238,7 @@ function(op_library TARGET) set(pybind_flag 1) endif() -#pybind USE_OP_DEVICE_KERNEL for CUDNN + # pybind USE_OP_DEVICE_KERNEL for CUDNN list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) if(${TARGET} STREQUAL "activation") @@ -256,14 +247,7 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() endif() - -#pybind USE_OP_DEVICE_KERNEL for CUDNN - list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) - if (WITH_GPU AND ${cudnn_cu_srcs_len} GREATER 0) - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") - endif() - -#pybind USE_OP_DEVICE_KERNEL for MIOPEN + # pybind USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0) if(${TARGET} STREQUAL "activation") @@ -273,19 +257,18 @@ function(op_library TARGET) endif() endif() -#pybind USE_OP_DEVICE_KERNEL for MIOPEN + # pybind USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_srcs miopen_hip_cu_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() - if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) 
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() -#pybind USE_OP_DEVICE_KERNEL for MKLDNN + # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) -#Append first implemented MKLDNN activation operator + # Append first implemented MKLDNN activation operator if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") @@ -305,9 +288,9 @@ function(op_library TARGET) endif() endif() -#pybind USE_OP + # pybind USE_OP if (${pybind_flag} EQUAL 0) -#NOTE(*) : activation use macro to regist the kernels, set use_op manually. + # NOTE(*): activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") @@ -348,7 +331,7 @@ function(register_operators) endif() endforeach() -#Complete the processing of `UNITY_TARGET`. + # Complete the processing of `UNITY_TARGET`. if(WITH_UNITY_BUILD) finish_unity_target(cc) if(WITH_GPU) From 02af637060031b641fd52d022291ccb355654d75 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Tue, 8 Dec 2020 11:00:35 +0800 Subject: [PATCH 07/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/platform/dynload/hiprtc.h | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h index 59724045e4bd6d..b29bf6274e31b4 100644 --- a/paddle/fluid/platform/dynload/hiprtc.h +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -27,21 +27,20 @@ extern std::once_flag hiprtc_dso_flag; extern void* hiprtc_dso_handle; extern bool HasHIPRTC(); -#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... 
args) -> DECLARE_TYPE(__name, args...) { \ - using hiprtc_func = decltype(&::__name); \ - std::call_once(hiprtc_dso_flag, []() { \ +#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using hiprtc_func = decltype(&::__name); \ + std::call_once(hiprtc_dso_flag, []() { \ hiprtc_dso_handle = paddle::platform::dynload::GetHIPRTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ + }); \ + static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern struct DynLoad__##__name __name - /** * include all needed hiprtc functions **/ @@ -50,7 +49,7 @@ extern bool HasHIPRTC(); __macro(hiprtcCompileProgram); \ __macro(hiprtcCreateProgram); \ __macro(hiprtcDestroyProgram); \ - __macro(hiprtcGetCode); \ + __macro(hiprtcGetCode); \ __macro(hiprtcGetCodeSize); \ __macro(hiprtcGetProgramLog); \ __macro(hiprtcGetProgramLogSize) From 9c7ebc9ba2622a2c1cd45d43ee96b3d4ff3fc704 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Tue, 8 Dec 2020 13:45:39 +0800 Subject: [PATCH 08/10] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/operators.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 92c99a2df5b5f8..c69caa597239a0 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -247,6 +247,13 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() endif() + + # pybind USE_OP_DEVICE_KERNEL for CUDNN + list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) + if (WITH_GPU AND ${cudnn_cu_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") + endif() + # pybind 
USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0) From 02346a82e3361554bd1dc3f8fc21b377c405923f Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Thu, 10 Dec 2020 14:20:23 +0800 Subject: [PATCH 09/10] =?UTF-8?q?=E7=BB=9F=E4=B8=80=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/operators.cmake | 2 +- .../fluid/platform/dynload/dynamic_loader.cc | 47 ++++------------ .../fluid/platform/dynload/dynamic_loader.h | 8 --- paddle/fluid/platform/dynload/hiprand.h | 24 ++++---- paddle/fluid/platform/dynload/hiprtc.cc | 4 +- paddle/fluid/platform/dynload/hiprtc.h | 28 +++++----- paddle/fluid/platform/dynload/miopen.cc | 4 +- paddle/fluid/platform/dynload/miopen.h | 44 +++++++-------- paddle/fluid/platform/dynload/rccl.h | 2 +- paddle/fluid/platform/dynload/rocblas.h | 26 ++++----- paddle/fluid/platform/dynload/rocm_driver.cc | 4 +- paddle/fluid/platform/dynload/rocm_driver.h | 4 +- paddle/fluid/platform/dynload/rocrand.cc | 30 ---------- paddle/fluid/platform/dynload/rocrand.h | 55 ------------------- 14 files changed, 83 insertions(+), 199 deletions(-) delete mode 100644 paddle/fluid/platform/dynload/rocrand.cc delete mode 100644 paddle/fluid/platform/dynload/rocrand.h diff --git a/cmake/operators.cmake b/cmake/operators.cmake index c69caa597239a0..824daf77519afe 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -164,7 +164,7 @@ function(op_library TARGET) ${op_common_deps}) endif() elseif (WITH_ROCM_PLATFORM) - hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS {op_library_DEPS} + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() # Unity Build relies on 
global option `WITH_UNITY_BUILD` and local option `UNITY`. diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 5a719862d24c83..303322a710a7d2 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -264,17 +264,13 @@ void* GetCublasDsoHandle() { #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif } -#ifdef PADDLE_WITH_HIP -void* GetRocblasDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); -} -#endif - void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) std::string mac_warn_meg( @@ -296,18 +292,14 @@ void* GetCUDNNDsoHandle() { "CUDNN version."); return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path}); #endif } -#ifdef PADDLE_WITH_HIP -void* GetMIOPENDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); -} -#endif - void* GetCUPTIDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.dylib", false, @@ -324,17 +316,13 @@ void* GetCurandDsoHandle() { #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif } -#ifdef PADDLE_WITH_HIP -void* 
GetRocrandDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); -} -#endif - void* GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); @@ -349,31 +337,23 @@ void* GetCusolverDsoHandle() { void* GetNVRTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); #endif } -#ifdef PADDLE_WITH_HIP -void* GetHIPRTCDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so"); -} -#endif - void* GetCUDADsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false); #endif } -#ifdef PADDLE_WITH_HIP -void* GetROCMDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so"); -} -#endif - void* GetWarpCTCDsoHandle() { std::string warpctc_dir = ""; if (!s_py_site_pkg_path.path.empty()) { @@ -396,16 +376,13 @@ void* GetNCCLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg); +#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) + return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); #else return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); #endif } -#if defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) -void* GetRCCLDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); -} -#endif void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) diff 
--git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 2211f7c034c03d..1136184ce1fc9a 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -38,14 +38,6 @@ void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); -#ifdef PADDLE_WITH_HIP -void* GetRocblasDsoHandle(); -void* GetMIOPENDsoHandle(); -void* GetHiprandDsoHandle(); -void* GetHIPRTCDsoHandle(); -void* GetROCMDsoHandle(); -void* GetRCCLDsoHandle(); -#endif void SetPaddleLibPath(const std::string&); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/hiprand.h b/paddle/fluid/platform/dynload/hiprand.h index 298787a0436d0c..496e70bb26db68 100644 --- a/paddle/fluid/platform/dynload/hiprand.h +++ b/paddle/fluid/platform/dynload/hiprand.h @@ -26,18 +26,18 @@ namespace dynload { extern std::once_flag hiprand_dso_flag; extern void *hiprand_dso_handle; -#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - hiprandStatus_t operator()(Args... args) { \ - using hiprandFunc = decltype(&::__name); \ - std::call_once(hiprand_dso_flag, []() { \ - hiprand_dso_handle = paddle::platform::dynload::GetHiprandDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + hiprandStatus_t operator()(Args... 
args) { \ + using hiprandFunc = decltype(&::__name); \ + std::call_once(hiprand_dso_flag, []() { \ + hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #define HIPRAND_RAND_ROUTINE_EACH(__macro) \ diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc index 0aa1f270879314..537597aaafd4b4 100644 --- a/paddle/fluid/platform/dynload/hiprtc.cc +++ b/paddle/fluid/platform/dynload/hiprtc.cc @@ -25,9 +25,9 @@ void* hiprtc_dso_handle = nullptr; HIPRTC_ROUTINE_EACH(DEFINE_WRAP); -bool HasHIPRTC() { +bool HasNVRTC() { std::call_once(hiprtc_dso_flag, - []() { hiprtc_dso_handle = GetHIPRTCDsoHandle(); }); + []() { hiprtc_dso_handle = GetNVRTCDsoHandle(); }); return hiprtc_dso_handle != nullptr; } diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h index b29bf6274e31b4..7cc58489fad9c5 100644 --- a/paddle/fluid/platform/dynload/hiprtc.h +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -25,20 +25,20 @@ namespace dynload { extern std::once_flag hiprtc_dso_flag; extern void* hiprtc_dso_handle; -extern bool HasHIPRTC(); - -#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using hiprtc_func = decltype(&::__name); \ - std::call_once(hiprtc_dso_flag, []() { \ - hiprtc_dso_handle = paddle::platform::dynload::GetHIPRTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +extern bool HasNVRTC(); + +#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using hiprtc_func = decltype(&::__name); \ + std::call_once(hiprtc_dso_flag, []() { \ + hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern struct DynLoad__##__name __name /** diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc index 00a1daf34fb89e..1b4bdd2939feb9 100644 --- a/paddle/fluid/platform/dynload/miopen.cc +++ b/paddle/fluid/platform/dynload/miopen.cc @@ -50,13 +50,13 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); #endif -bool HasMIOpen() { +bool HasCUDNN() { std::call_once(miopen_dso_flag, []() { miopen_dso_handle = GetCUDNNDsoHandle(); }); return miopen_dso_handle != nullptr; } -void EnforceMIOPENLoaded(const char* fn_name) { +void EnforceCUDNNLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( miopen_dso_handle, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index e4780c570b71b1..2de6429805c131 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -26,7 +26,7 @@ namespace dynload { extern std::once_flag miopen_dso_flag; extern void* miopen_dso_handle; -extern bool HasMIOpen(); +extern bool HasCUDNN(); inline const char* miopenGetErrorString(miopenStatus_t status) { switch (status) { @@ -51,19 +51,19 @@ inline const char* miopenGetErrorString(miopenStatus_t status) { } extern void EnforceCUDNNLoaded(const char* fn_name); -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using miopen_func = decltype(&::__name); \ - std::call_once(miopen_dso_flag, []() { \ - miopen_dso_handle = paddle::platform::dynload::GetMIOPENDsoHandle(); \ - }); \ - EnforceCUDNNLoaded(#__name); \ - static void* p_##__name = dlsym(miopen_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using miopen_func = decltype(&::__name); \ + std::call_once(miopen_dso_flag, []() { \ + miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ + }); \ + EnforceCUDNNLoaded(#__name); \ + static void* p_##__name = dlsym(miopen_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern struct DynLoad__##__name __name /** @@ -123,23 +123,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(miopenDestroyDropoutDescriptor); \ __macro(miopenDestroyRNNDescriptor); -MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) #define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \ __macro(miopenConvolutionBackwardData); -MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs available after R3: #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize); -MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs available after R4: #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ __macro(miopenBatchNormalizationForwardTraining); \ __macro(miopenBatchNormalizationForwardInference); \ __macro(miopenBatchNormalizationBackward); -MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs in 
R5 #define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \ @@ -147,12 +147,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(miopenSetActivationDescriptor); \ __macro(miopenGetActivationDescriptor); \ __macro(miopenDestroyActivationDescriptor); -MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs in R6 #define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \ /*__macro(miopenSetRNNDescriptor_v6);*/ -MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) #define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \ __macro(miopenSetConvolutionGroupCount); \ @@ -162,7 +162,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(miopenSetCTCLossDescriptor); \ __macro(miopenGetCTCLossWorkspaceSize); \ __macro(miopenCTCLoss); -MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ /*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ @@ -170,7 +170,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx); \ __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ __macro(cudnnBatchNormalizationBackwardEx); \ __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/ -MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h index ac8d111e8d26dd..1d61e330c248ff 100644 --- a/paddle/fluid/platform/dynload/rccl.h +++ b/paddle/fluid/platform/dynload/rccl.h @@ -32,7 +32,7 @@ extern void* rccl_dso_handle; auto operator()(Args... 
args) -> decltype(__name(args...)) { \ using nccl_func = decltype(&::__name); \ std::call_once(rccl_dso_flag, []() { \ - rccl_dso_handle = paddle::platform::dynload::GetRCCLDsoHandle(); \ + rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \ }); \ static void* p_##__name = dlsym(rccl_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h index 7d812c4db1bb0d..f78ed00ac63d03 100644 --- a/paddle/fluid/platform/dynload/rocblas.h +++ b/paddle/fluid/platform/dynload/rocblas.h @@ -36,19 +36,19 @@ extern void *rocblas_dso_handle; * * note: default dynamic linked libs */ -#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using rocblas_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(rocblas_dso_flag, []() { \ - rocblas_dso_handle = paddle::platform::dynload::GetRocblasDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using rocblas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(rocblas_dso_flag, []() { \ + rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \ diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc index c26e7f57ecaa2d..89ddd3018ffa69 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.cc +++ b/paddle/fluid/platform/dynload/rocm_driver.cc @@ -25,8 +25,8 @@ void* rocm_dso_handle = nullptr; ROCM_ROUTINE_EACH(DEFINE_WRAP); -bool HasROCMDriver() { - std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetROCMDsoHandle(); }); +bool HasCUDADriver() { + std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); }); return rocm_dso_handle != nullptr; } diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 487eed27b40a29..dc9c18e732b0ba 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -26,7 +26,7 @@ namespace dynload { extern std::once_flag rocm_dso_flag; extern void* rocm_dso_handle; -extern bool HasROCMDriver(); +extern bool HasCUDADriver(); #define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \ struct DynLoad__##__name { \ @@ -34,7 +34,7 @@ extern bool HasROCMDriver(); auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ using rocm_func = decltype(&::__name); \ std::call_once(rocm_dso_flag, []() { \ - rocm_dso_handle = paddle::platform::dynload::GetROCMDsoHandle(); \ + rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \ }); \ static void* p_##__name = dlsym(rocm_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ diff --git a/paddle/fluid/platform/dynload/rocrand.cc b/paddle/fluid/platform/dynload/rocrand.cc deleted file mode 100644 index 8e14f88ff9c182..00000000000000 --- a/paddle/fluid/platform/dynload/rocrand.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/rocrand.h" - -namespace paddle { -namespace platform { -namespace dynload { - -std::once_flag curand_dso_flag; -void *curand_dso_handle; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocrand.h b/paddle/fluid/platform/dynload/rocrand.h deleted file mode 100644 index 0ff88855f244ef..00000000000000 --- a/paddle/fluid/platform/dynload/rocrand.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include - -#include // NOLINT -#include "paddle/fluid/platform/port.h" - -#include "paddle/fluid/platform/dynload/dynamic_loader.h" - -namespace paddle { -namespace platform { -namespace dynload { -extern std::once_flag curand_dso_flag; -extern void *curand_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - rocrand_status operator()(Args... args) { \ - using curandFunc = decltype(&::__name); \ - std::call_once(curand_dso_flag, []() { \ - curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(curand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define CURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(rocrand_create_generator); \ - __macro(rocrand_set_stream); \ - __macro(rocrand_generate_uniform); \ - __macro(rocrand_generate_uniform_double); \ - __macro(rocrand_generate_normal); \ - __macro(rocrand_destroy_generator); - -CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); - -} // namespace dynload -} // namespace platform -} // namespace paddle From c58044ff782cb3d6c6646f0650946910fa81def3 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Mon, 14 Dec 2020 11:41:52 +0800 Subject: [PATCH 10/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=97=A5=E6=9C=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/platform/dynload/hiprtc.cc | 2 +- paddle/fluid/platform/dynload/rocm_driver.cc | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc index 537597aaafd4b4..86a39d08eaa520 100644 --- a/paddle/fluid/platform/dynload/hiprtc.cc +++ b/paddle/fluid/platform/dynload/hiprtc.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc index 89ddd3018ffa69..9ec123b632ffa4 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.cc +++ b/paddle/fluid/platform/dynload/rocm_driver.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.