From e2175116855add7a2d8569685f0b2dc994765227 Mon Sep 17 00:00:00 2001 From: XUANBABY Date: Thu, 3 Dec 2020 17:39:30 +0800 Subject: [PATCH 01/10] =?UTF-8?q?=E6=B7=BB=E5=8A=A0rocm=E5=B9=B3=E5=8F=B0?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 15 +- cmake/configure.cmake | 6 +- cmake/external/eigen.cmake | 2 +- cmake/external/pybind11.cmake | 2 +- cmake/external/rocprim.cmake | 49 ----- cmake/flags.cmake | 2 +- cmake/generic.cmake | 59 +++++- cmake/hip.cmake | 104 +++++++--- cmake/operators.cmake | 52 ++++- cmake/third_party.cmake | 5 - paddle/fluid/operators/math/CMakeLists.txt | 4 +- paddle/fluid/platform/dynload/CMakeLists.txt | 19 +- .../fluid/platform/dynload/dynamic_loader.cc | 53 ++++++ .../fluid/platform/dynload/dynamic_loader.h | 8 + paddle/fluid/platform/dynload/hiprand.cc | 30 +++ paddle/fluid/platform/dynload/hiprand.h | 56 ++++++ paddle/fluid/platform/dynload/hiprtc.cc | 36 ++++ paddle/fluid/platform/dynload/hiprtc.h | 64 +++++++ paddle/fluid/platform/dynload/miopen.cc | 70 +++++++ paddle/fluid/platform/dynload/miopen.h | 178 ++++++++++++++++++ paddle/fluid/platform/dynload/rccl.cc | 30 +++ paddle/fluid/platform/dynload/rccl.h | 64 +++++++ paddle/fluid/platform/dynload/rocblas.cc | 40 ++++ paddle/fluid/platform/dynload/rocblas.h | 105 +++++++++++ paddle/fluid/platform/dynload/rocm_driver.cc | 35 ++++ paddle/fluid/platform/dynload/rocm_driver.h | 65 +++++++ paddle/fluid/platform/dynload/rocrand.cc | 30 +++ paddle/fluid/platform/dynload/rocrand.h | 55 ++++++ paddle/fluid/platform/float16.h | 80 ++++---- paddle/fluid/platform/hostdevice.h | 6 +- paddle/fluid/pybind/CMakeLists.txt | 10 +- 31 files changed, 1189 insertions(+), 145 deletions(-) delete mode 100644 cmake/external/rocprim.cmake create mode 100644 paddle/fluid/platform/dynload/hiprand.cc create mode 100644 paddle/fluid/platform/dynload/hiprand.h create mode 100644 
paddle/fluid/platform/dynload/hiprtc.cc create mode 100644 paddle/fluid/platform/dynload/hiprtc.h create mode 100644 paddle/fluid/platform/dynload/miopen.cc create mode 100644 paddle/fluid/platform/dynload/miopen.h create mode 100644 paddle/fluid/platform/dynload/rccl.cc create mode 100644 paddle/fluid/platform/dynload/rccl.h create mode 100644 paddle/fluid/platform/dynload/rocblas.cc create mode 100644 paddle/fluid/platform/dynload/rocblas.h create mode 100644 paddle/fluid/platform/dynload/rocm_driver.cc create mode 100644 paddle/fluid/platform/dynload/rocm_driver.h create mode 100644 paddle/fluid/platform/dynload/rocrand.cc create mode 100644 paddle/fluid/platform/dynload/rocrand.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d0cff762e22038..1e0f7a0f095d20 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,7 +124,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) ################################ Internal Configurations ####################################### -option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) +option(WITH_ROCM_PLATFORM "Compile PaddlePaddle with ROCM platform" OFF) option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) @@ -254,10 +254,19 @@ include(configure) # add paddle env configuration include_directories("${PADDLE_SOURCE_DIR}") -if(WITH_AMD_GPU) +if(NOT DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") + set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been installed") +else() + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") + set(HIP_PATH ${ROCM_PATH}/hip CACHE PATH "Path to which HIP has been 
installed") +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + +if(WITH_ROCM_PLATFORM) find_package(HIP) include(hip) -endif(WITH_AMD_GPU) +endif(WITH_ROCM_PLATFORM) if(WITH_ARM) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index fc984f5e560ef3..a31981d78d54eb 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -121,10 +121,14 @@ if(WITH_GPU) endif() include_directories(${TENSORRT_INCLUDE_DIR}) endif() -elseif(WITH_AMD_GPU) +elseif(WITH_ROCM_PLATFORM) add_definitions(-DPADDLE_WITH_HIP) + add_definitions(-DEIGEN_USE_HIP) + add_definitions(-D__HIP_PLATFORM_HCC__) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP") + set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP) else() add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index f27dcd06ef8e2e..6d1525be2c9b9f 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -28,7 +28,7 @@ endif() # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen -if(WITH_AMD_GPU) +if(WITH_ROCM_PLATFORM) set(EIGEN_REPOSITORY ${GIT_URL}/sabreshao/hipeigen.git) set(EIGEN_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e) endif() diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 353cb5c72fdfb9..117c6cde11e65d 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -39,7 +39,7 @@ ExternalProject_Add( # to be modified without triggering incremental compilation, and the # third-party library version changes cannot be incorporated. 
# reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html - UPDATE_COMMAND "" +# UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake deleted file mode 100644 index 6bcecb88e9886d..00000000000000 --- a/cmake/external/rocprim.cmake +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# rocprim is "ROCm Parallel Primitives" for short. -# It is a header-only library providing HIP and HC parallel primitives -# for developing performant GPU-accelerated code on AMD ROCm platform. 
- -if("x${HCC_HOME}" STREQUAL "x") - set(HCC_HOME "/opt/rocm/hcc") -endif() - -INCLUDE(ExternalProject) - -SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim) -SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim) -SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include) - -ExternalProject_Add( - extern_rocprim - ${SHALLOW_CLONE} - GIT_REPOSITORY "${GIT_URL}/ROCmSoftwarePlatform/rocPRIM.git" - GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc - PREFIX ${ROCPRIM_SOURCE_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc - CMAKE_ARGS -DONLY_INSTALL=ON - CMAKE_ARGS -DBUILD_TEST=OFF - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR} - - INSTALL_DIR ${ROCPRIM_INSTALL_DIR} - ${EXTERNAL_PROJECT_LOG_ARGS} -) - -INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR}) - -add_library(rocprim INTERFACE) - -add_dependencies(rocprim extern_rocprim) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index ef7d3f2f5ba9d6..bd4962908d7cda 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -155,7 +155,7 @@ set(COMMON_FLAGS ) if(NOT APPLE) - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) + if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM_PLATFORM AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 7.3)) set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-format-truncation # Warning in boost gcc 8.2 diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9d0d9e7dc442ee..0cd41043fd5e95 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -267,7 +267,7 @@ function(merge_static_libs TARGET_NAME) endfunction(merge_static_libs) function(check_coverage_opt TARGET_NAME SRCS) - if(WITH_COVERAGE AND WITH_INCREMENTAL_COVERAGE) + if(WITH_COVERAGE) if ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "") if (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" STREQUAL "")) string(REPLACE "," ";" CC_FILE_LIST $ENV{PADDLE_GIT_DIFF_CC_FILE}) @@ -532,12 +532,13 @@ function(nv_test TARGET_NAME) endfunction(nv_test) function(hip_library TARGET_NAME) - if 
(WITH_AMD_GPU) + if (WITH_ROCM_PLATFORM) set(options STATIC static SHARED shared) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(_sources ${hip_library_SRCS}) + set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) if(_source_files) list(REMOVE_ITEM _sources ${_source_files}) @@ -549,7 +550,7 @@ function(hip_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) - target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so) + target_link_libraries(${TARGET_NAME} ${ROCM_PATH}/hip/lib/libhip_hcc.so) find_fluid_modules(${TARGET_NAME}) endif() if("${hip_library_DEPS}" MATCHES "ARCHIVE_START") @@ -580,12 +581,59 @@ function(hip_library TARGET_NAME) endif() endfunction(hip_library) +function(hip_library_ops TARGET_NAME) + if (WITH_ROCM_PLATFORM) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_library_ops "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(_sources ${hip_library_ops_SRCS}) + HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + if(hip_library_ops_SRCS) + if (hip_library_ops_SHARED OR hip_library_ops_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources}) + 
set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) + else() + add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) + target_link_libraries(${TARGET_NAME} ${ROCM_PATH}/hip/lib/libhip_hcc.so) + find_fluid_modules(${TARGET_NAME}) + endif() + if("${hip_library_ops_DEPS}" MATCHES "ARCHIVE_START") + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS). + # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries. + target_circle_link_libraries(${TARGET_NAME} ${hip_library_ops_DEPS}) + list(REMOVE_ITEM hip_library_ops_DEPS ARCHIVE_START ARCHIVE_END) + else() + target_link_libraries(${TARGET_NAME} ${hip_library_ops_DEPS}) + endif() + # cpplint code style + foreach(source_file ${hip_library_ops_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND hip_library_ops_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else(hip_library_ops_SRCS) + if (hip_library_ops_DEPS) + merge_static_libs(${TARGET_NAME} ${hip_library_ops_DEPS}) + else() + message(FATAL "Please specify source file or library in nv_library.") + endif() + endif(hip_library_ops_SRCS) + endif() +endfunction(hip_library_ops) + function(hip_binary TARGET_NAME) - if (WITH_AMD_GPU) + if (WITH_ROCM_PLATFORM) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS}) if(hip_binary_DEPS) target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS}) @@ -599,12 +647,13 @@ function(hip_binary TARGET_NAME) endfunction(hip_binary) function(hip_test TARGET_NAME) - if (WITH_AMD_GPU AND WITH_TESTING) + if 
(WITH_ROCM_PLATFORM AND WITH_TESTING) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(_sources ${hip_test_SRCS}) + set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) if(_source_files) list(REMOVE_ITEM _sources ${_source_files}) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 27ecd50e886b72..4d1a074e1abb9c 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -1,49 +1,105 @@ -if(NOT WITH_AMD_GPU) +if(NOT WITH_ROCM_PLATFORM) return() endif() -include_directories("/opt/rocm/include") -include_directories("/opt/rocm/hip/include") -include_directories("/opt/rocm/miopen/include") -include_directories("/opt/rocm/hipblas/include") -include_directories("/opt/rocm/hiprand/include") -include_directories("/opt/rocm/rocrand/include") -include_directories("/opt/rocm/rccl/include") -include_directories("/opt/rocm/thrust") +include_directories("${ROCM_PATH}/include") +include_directories("${ROCM_PATH}/hip/include") +include_directories("${ROCM_PATH}/miopen/include") +include_directories("${ROCM_PATH}/hipblas/include") +include_directories("${ROCM_PATH}/rocblas/include") +include_directories("${ROCM_PATH}/hiprand/include") +include_directories("${ROCM_PATH}/rocrand/include") +include_directories("${ROCM_PATH}/rccl/include") -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) +#include_directories("${ROCM_PATH}/thrust") +include_directories("${ROCM_PATH}/rocthrust/include/") +include_directories("${ROCM_PATH}/hipcub/include/") +include_directories("${ROCM_PATH}/rocprim/include/") +include_directories("${ROCM_PATH}/hipsparse/include/") +include_directories("${ROCM_PATH}/rocsparse/include/") 
+include_directories("${ROCM_PATH}/rocfft/include/") + +set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "") +set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "") +# now default is clang +set(HIP_COMPILER "clang") + +list(APPEND EXTERNAL_LIBS "-L${ROCM_PATH}/lib/ -lhip_hcc") +set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -DEIGEN_USE_HIP -DEIGEN_USE_GPU -D__HIP_NO_HALF_CONVERSIONS__ -std=c++11 --amdgpu-target=gfx906" ) + +if(WITH_RCCL) + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_RCCL") +endif() + +if(NOT WITH_PYTHON) + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_NO_PYTHON") +endif(NOT WITH_PYTHON) + +if(WITH_DSO) + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_USE_DSO") +endif(WITH_DSO) if(WITH_TESTING) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_TESTING") endif(WITH_TESTING) if(WITH_DISTRIBUTE) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") endif(WITH_DISTRIBUTE) if(WITH_GRPC) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_GRPC") endif(WITH_GRPC) if(WITH_MKLDNN) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DPADDLE_WITH_MKLDNN") endif(WITH_MKLDNN) -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") +set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) + list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) + list(APPEND HIP_HIPCC_FLAGS 
${CMAKE_CXX_FLAGS_MINSIZEREL}) endif() -if("x${HCC_HOME}" STREQUAL "x") - set(HCC_HOME "/opt/rocm/hcc") -endif() +if("${HIP_COMPILER}" STREQUAL "hcc") + if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "${ROCM_PATH}/hcc") + endif() + + set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -ldl --amdgpu-target=gfx906 ") + set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared --amdgpu-target=gfx906") + set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared --amdgpu-target=gfx906") + +elseif("${HIP_COMPILER}" STREQUAL "clang") + + if("x${HIP_CLANG_PATH}" STREQUAL "x") + set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin") + endif() -set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") -set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") -set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") + #Number of parallel jobs by default is 1 + if(NOT DEFINED HIP_CLANG_NUM_PARALLEL_JOBS) + set(HIP_CLANG_NUM_PARALLEL_JOBS 1) + endif() + #Add support for parallel build and link + if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + check_cxx_compiler_flag("-parallel-jobs=1" HIP_CLANG_SUPPORTS_PARALLEL_JOBS) + endif() + if(HIP_CLANG_NUM_PARALLEL_JOBS GREATER 1) + if(${HIP_CLANG_SUPPORTS_PARALLEL_JOBS}) + set(HIP_CLANG_PARALLEL_BUILD_COMPILE_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS} -Wno-format-nonliteral") + set(HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS "-parallel-jobs=${HIP_CLANG_NUM_PARALLEL_JOBS}") + else() + message("clang compiler doesn't support parallel jobs") + endif() + endif() + + # Set the CMake Flags to use the HIP-Clang Compiler. 
+ set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o --amdgpu-target=gfx906") + set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o -shared --amdgpu-target=gfx906" ) + set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HIP_CLANG_PATH} ${HIP_CLANG_PARALLEL_BUILD_LINK_OPTIONS} -o -ldl --amdgpu-target=gfx906") +endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 715d324c357fb3..c057bddcd20c70 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -6,11 +6,13 @@ function(op_library TARGET) set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) - set(miopen_hip_cc_srcs) + set(hip_cu_cc_srcs) set(cu_cc_srcs) set(xpu_cc_srcs) set(cudnn_cu_cc_srcs) set(cudnn_cu_srcs) + set(miopen_hip_cu_cc_srcs) + set(miopen_hip_cu_srcs) set(CUDNN_FILE) set(mkldnn_cc_srcs) set(MKLDNN_FILE) @@ -49,10 +51,24 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu) list(APPEND cudnn_cu_srcs ${CUDNN_FILE}.cu) endif() - if(WITH_AMD_GPU) + if(WITH_ROCM_PLATFORM) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu.cc) + list(APPEND hip_cu_cc_srcs ${TARGET}.hip.cu.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) + list(APPEND hip_cu_srcs ${TARGET}.hip.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu) + set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu + ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) + list(APPEND hip_cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.hip.cu) + endif() string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc) - list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cu.cc) + list(APPEND miopen_hip_cu_cc_srcs ${MIOPEN_FILE}.hip.cu.cc) + endif() + 
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cu) + list(APPEND miopen_hip_cu_srcs ${MIOPEN_FILE}.hip.cu) endif() endif() if(WITH_MKLDNN) @@ -69,16 +85,20 @@ function(op_library TARGET) endif() else() foreach(src ${op_library_SRCS}) - if (${src} MATCHES ".*\\.hip.cu$") + if (WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu$") list(APPEND hip_cu_srcs ${src}) + elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*\\.hip.cu.cc$") + list(APPEND hip_cu_cc_srcs ${src}) elseif(${src} MATCHES ".*_cudnn_op.cu$") list(APPEND cudnn_cu_srcs ${src}) elseif (${src} MATCHES ".*\\.cu$") list(APPEND cu_srcs ${src}) elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") list(APPEND cudnn_cu_cc_srcs ${src}) - elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$") + elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*_miopen_op.hip.cc$") list(APPEND miopen_hip_cc_srcs ${src}) + elseif(WITH_ROCM_PLATFORM AND ${src} MATCHES ".*_miopen_op.hip.cu$") + list(APPEND miopen_hip_cu_srcs ${src}) elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") @@ -114,8 +134,8 @@ function(op_library TARGET) if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) - elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + elseif (WITH_ROCM_PLATFORM) + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} @@ -191,9 +211,21 @@ function(op_library TARGET) endif() # pybind USE_OP_DEVICE_KERNEL for MIOPEN - if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0) - file(APPEND ${pybind_file} 
"USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") + list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len) + if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0) + if(${TARGET} STREQUAL "activation") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") + else() + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") + endif() endif() + + # pybind USE_OP_DEVICE_KERNEL for MIOPEN + list(LENGTH miopen_hip_cu_srcs miopen_hip_cu_srcs_len) + if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") + endif() + if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 1eb2096af91dc9..4102949e26e2fd 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -209,11 +209,6 @@ include(external/warpctc) # download, build, install warpctc list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool) -if(WITH_AMD_GPU) - include(external/rocprim) # download, build, install rocprim - list(APPEND third_party_deps extern_rocprim) -endif() - include(cblas) # find first, then download, build, install openblas if(${CBLAS_PROVIDER} STREQUAL MKLML) list(APPEND third_party_deps extern_mklml) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 384393d9601e37..4e02b861402ac6 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -35,8 +35,8 @@ function(math_library TARGET) list(LENGTH cc_srcs cc_srcs_len) if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) - elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS 
${math_library_DEPS} ${math_common_deps}) + elseif (WITH_ROCM_PLATFORM AND (${hip_srcs} MATCHES ".*\\.hip.cu$")) + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) endif() diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 9ea218907a4cde..647bff93122b13 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,6 +1,10 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc) +#hip +if (WITH_ROCM_PLATFORM) + list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) +endif() # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux. @@ -9,6 +13,12 @@ if (NOT APPLE AND NOT WIN32) if (WITH_NCCL) list(APPEND CUDA_SRCS nccl.cc) endif() + if (WITH_ROCM_PLATFORM) + list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc) + if (WITH_RCCL) + list(APPEND HIP_SRCS rccl.cc) + endif() + endif() endif() if (TENSORRT_FOUND) @@ -19,8 +29,13 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) -nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) -cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +if(WITH_ROCM_PLATFORM) + hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) + hip_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +else() + nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +endif() if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() diff --git 
a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 03cd5814afdb52..dee7c001da8038 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -46,6 +46,24 @@ DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); +#ifdef PADDLE_WITH_HIP + +DEFINE_string(miopen_dir, "", + "Specify path for loading libcudnn.so. For instance, " + "/usr/local/cudnn/lib. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); + +DEFINE_string(rocm_dir, "", + "Specify path for loading cuda library, such as libcublas, " + "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. " + "If default, dlopen will search cuda from LD_LIBRARY_PATH"); + +DEFINE_string(rccl_dir, "", + "Specify path for loading nccl library, such as libnccl.so. " + "For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); +#endif + namespace paddle { namespace platform { namespace dynload { @@ -251,6 +269,12 @@ void* GetCublasDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetRocblasDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); +} +#endif + void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) std::string mac_warn_meg( @@ -278,6 +302,12 @@ void* GetCUDNNDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetMIOPENDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); +} +#endif + void* GetCUPTIDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.dylib", false, @@ -299,6 +329,12 @@ void* GetCurandDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetRocrandDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); +} +#endif + void* 
GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); @@ -318,6 +354,12 @@ void* GetNVRTCDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetHIPRTCDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so"); +} +#endif + void* GetCUDADsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); @@ -326,6 +368,12 @@ void* GetCUDADsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetROCMDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so"); +} +#endif + void* GetWarpCTCDsoHandle() { std::string warpctc_dir = ""; if (!s_py_site_pkg_path.path.empty()) { @@ -353,6 +401,11 @@ void* GetNCCLDsoHandle() { warning_msg); #endif } +#if defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) +void* GetRCCLDsoHandle() { + return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so",true); +} +#endif void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 1136184ce1fc9a..2211f7c034c03d 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -38,6 +38,14 @@ void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); +#ifdef PADDLE_WITH_HIP +void* GetRocblasDsoHandle(); +void* GetMIOPENDsoHandle(); +void* GetHiprandDsoHandle(); +void* GetHIPRTCDsoHandle(); +void* GetROCMDsoHandle(); +void* GetRCCLDsoHandle(); +#endif void SetPaddleLibPath(const std::string&); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/hiprand.cc b/paddle/fluid/platform/dynload/hiprand.cc new file mode 100644 index 00000000000000..4fb26d0f9c85a1 --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprand.cc @@ -0,0 +1,30 @@ 
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/hiprand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag hiprand_dso_flag; +void *hiprand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HIPRAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hiprand.h b/paddle/fluid/platform/dynload/hiprand.h new file mode 100644 index 00000000000000..89cd4179f233c2 --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprand.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include + +#include // NOLINT +#include "paddle/fluid/platform/port.h" + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag hiprand_dso_flag; +extern void *hiprand_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + hiprandStatus_t operator()(Args... args) { \ + using hiprandFunc = decltype(&::__name); \ + std::call_once(hiprand_dso_flag, []() { \ + hiprand_dso_handle = paddle::platform::dynload::GetHiprandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define HIPRAND_RAND_ROUTINE_EACH(__macro) \ + __macro(hiprandCreateGenerator); \ + __macro(hiprandSetStream); \ + __macro(hiprandSetPseudoRandomGeneratorSeed); \ + __macro(hiprandGenerateUniform); \ + __macro(hiprandGenerateUniformDouble); \ + __macro(hiprandGenerateNormal); \ + __macro(hiprandDestroyGenerator); + +HIPRAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc new file mode 100644 index 00000000000000..0aa1f270879314 --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprtc.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/hiprtc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag hiprtc_dso_flag; +void* hiprtc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HIPRTC_ROUTINE_EACH(DEFINE_WRAP); + +bool HasHIPRTC() { + std::call_once(hiprtc_dso_flag, + []() { hiprtc_dso_handle = GetHIPRTCDsoHandle(); }); + return hiprtc_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h new file mode 100644 index 00000000000000..59724045e4bd6d --- /dev/null +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag hiprtc_dso_flag; +extern void* hiprtc_dso_handle; +extern bool HasHIPRTC(); + +#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using hiprtc_func = decltype(&::__name); \ + std::call_once(hiprtc_dso_flag, []() { \ + hiprtc_dso_handle = paddle::platform::dynload::GetHIPRTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + + +/** + * include all needed hiprtc functions + **/ +#define HIPRTC_ROUTINE_EACH(__macro) \ + __macro(hiprtcGetErrorString); \ + __macro(hiprtcCompileProgram); \ + __macro(hiprtcCreateProgram); \ + __macro(hiprtcDestroyProgram); \ + __macro(hiprtcGetCode); \ + __macro(hiprtcGetCodeSize); \ + __macro(hiprtcGetProgramLog); \ + __macro(hiprtcGetProgramLogSize) + +HIPRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc new file mode 100644 index 00000000000000..38041a92a9c896 --- /dev/null +++ b/paddle/fluid/platform/dynload/miopen.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag miopen_dso_flag; +void* miopen_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MIOPEN_DNN_ROUTINE_EACH(DEFINE_WRAP); +MIOPEN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); + +#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R3 +MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R4 +MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_R5 +MIOPEN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_R6 +MIOPEN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_R7 +MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + +#ifdef MIOPEN_DNN_ROUTINE_EACH_AFTER_R7 +MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); +#endif + + +bool HasMIOpen() { + std::call_once(miopen_dso_flag, + []() { miopen_dso_handle = GetMIOPENDsoHandle(); }); + return miopen_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE_NOT_NULL( + miopen_dso_handle, + platform::errors::PreconditionNotMet( + "Cannot load miopen shared library. Cannot invoke method %s.", fn_name)); +} + + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h new file mode 100644 index 00000000000000..792b39251de8fc --- /dev/null +++ b/paddle/fluid/platform/dynload/miopen.h @@ -0,0 +1,178 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag miopen_dso_flag; +extern void* miopen_dso_handle; +extern bool HasMIOpen(); + + +inline const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "MIOPEN_STATUS_SUCCESS"; + case miopenStatusNotInitialized: + return "MIOPEN_STATUS_NOT_INITIALIZED"; + case miopenStatusInvalidValue: + return "MIOPEN_STATUS_INVALID_VALUE"; + case miopenStatusBadParm: + return "MIOPEN_STATUS_BAD_PARAM"; + case miopenStatusAllocFailed: + return "MIOPEN_STATUS_ALLOC_FAILED"; + case miopenStatusInternalError: + return "MIOPEN_STATUS_INTERNAL_ERROR"; + case miopenStatusNotImplemented: + return "MIOPEN_STATUS_NOT_IMPLEMENTED"; + case miopenStatusUnknownError: + default: + return "MIOPEN_STATUS_UNKNOWN_ERROR"; + } +} + +extern void EnforceCUDNNLoaded(const char* fn_name); +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using miopen_func = decltype(&::__name); \ + std::call_once(miopen_dso_flag, []() { \ + miopen_dso_handle = paddle::platform::dynload::GetMIOPENDsoHandle(); \ + }); \ + EnforceCUDNNLoaded(#__name); \ + static void* p_##__name = dlsym(miopen_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + + +/** + * include all needed miopen functions in HPPL + **/ +#define MIOPEN_DNN_ROUTINE_EACH(__macro) \ + __macro(miopenSet4dTensorDescriptor); \ + __macro(miopenSetTensorDescriptor); \ + __macro(miopenInitConvolutionNdDescriptor); \ + __macro(miopenFindConvolutionForwardAlgorithm); \ + __macro(miopenGetConvolutionNdForwardOutputDim); \ + __macro(miopenFindConvolutionBackwardDataAlgorithm); \ + __macro(miopenFindConvolutionBackwardWeightsAlgorithm); \ + __macro(miopenGetTensorDescriptor); \ + __macro(miopenCreateTensorDescriptor); \ + __macro(miopenDestroyTensorDescriptor); \ + __macro(miopenSet2dPoolingDescriptor); \ + __macro(miopenGet2dPoolingDescriptor); \ + __macro(miopenGetPoolingNdForwardOutputDim); \ + __macro(miopenCreateConvolutionDescriptor); \ + __macro(miopenCreatePoolingDescriptor); \ + __macro(miopenDestroyPoolingDescriptor); \ + __macro(miopenPoolingGetWorkSpaceSize); \ + __macro(miopenPoolingGetWorkSpaceSizeV2); \ + __macro(miopenSetNdPoolingDescriptor); \ + __macro(miopenInitConvolutionDescriptor); \ + __macro(miopenDestroyConvolutionDescriptor); \ + __macro(miopenGetConvolutionNdDescriptor); \ + __macro(miopenDeriveBNTensorDescriptor); \ + __macro(miopenCreate); \ + __macro(miopenDestroy); \ + __macro(miopenSetStream); \ + __macro(miopenActivationForward); \ + __macro(miopenActivationBackward); \ + __macro(miopenConvolutionBackwardWeights); \ + __macro(miopenConvolutionForward); \ + __macro(miopenConvolutionBackwardBias); \ + __macro(miopenConvolutionForwardGetWorkSpaceSize); \ + __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \ + __macro(miopenTransformTensor); \ + 
__macro(miopenPoolingForward); \ + __macro(miopenPoolingBackward); \ + __macro(miopenSoftmaxBackward); \ + __macro(miopenSoftmaxForward); \ + __macro(miopenCreateDropoutDescriptor); \ + __macro(miopenDropoutGetStatesSize); \ + __macro(miopenSetDropoutDescriptor); \ + __macro(miopenCreateRNNDescriptor); \ + __macro(miopenSetRNNDescriptor); \ + __macro(miopenGetRNNParamsSize); \ + __macro(miopenGetRNNWorkspaceSize); \ + __macro(miopenGetRNNTrainingReserveSize); \ + __macro(miopenRNNForwardTraining); \ + __macro(miopenRNNBackwardData); \ + __macro(miopenRNNBackwardWeights); \ + __macro(miopenRNNForwardInference); \ + __macro(miopenDestroyDropoutDescriptor); \ + __macro(miopenDestroyRNNDescriptor); \ + +MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(miopenConvolutionBackwardData); +MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R3: +#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize); +MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs available after R4: +#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(miopenBatchNormalizationForwardTraining); \ + __macro(miopenBatchNormalizationForwardInference); \ + __macro(miopenBatchNormalizationBackward); +MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs in R5 +#define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(miopenCreateActivationDescriptor); \ + __macro(miopenSetActivationDescriptor); \ + __macro(miopenGetActivationDescriptor); \ + __macro(miopenDestroyActivationDescriptor); +MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +// APIs in R6 +#define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \ + /*__macro(miopenSetRNNDescriptor_v6);*/ +MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(miopenSetConvolutionGroupCount); 
\ + __macro(miopenCreateCTCLossDescriptor); \ + __macro(miopenDestroyCTCLossDescriptor); \ + __macro(miopenGetCTCLossDescriptor); \ + __macro(miopenSetCTCLossDescriptor); \ + __macro(miopenGetCTCLossWorkspaceSize); \ + __macro(miopenCTCLoss); +MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ + /*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ + __macro(cudnnBatchNormalizationForwardTrainingEx); \ + __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ + __macro(cudnnBatchNormalizationBackwardEx); \ + __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/ +MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rccl.cc b/paddle/fluid/platform/dynload/rccl.cc new file mode 100644 index 00000000000000..a3043ead8329ae --- /dev/null +++ b/paddle/fluid/platform/dynload/rccl.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/rccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag rccl_dso_flag; +void *rccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h new file mode 100644 index 00000000000000..ac8d111e8d26dd --- /dev/null +++ b/paddle/fluid/platform/dynload/rccl.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag rccl_dso_flag; +extern void* rccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_RCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using nccl_func = decltype(&::__name); \ + std::call_once(rccl_dso_flag, []() { \ + rccl_dso_handle = paddle::platform::dynload::GetRCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(rccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define RCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(ncclCommInitAll); \ + __macro(ncclGetUniqueId); \ + __macro(ncclCommInitRank); \ + __macro(ncclCommDestroy); \ + __macro(ncclCommCount); \ + __macro(ncclCommCuDevice); \ + __macro(ncclCommUserRank); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ + __macro(ncclReduce); \ + __macro(ncclReduceScatter); \ + __macro(ncclGetErrorString); + +RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocblas.cc b/paddle/fluid/platform/dynload/rocblas.cc new file mode 100644 index 00000000000000..ee774195363216 --- /dev/null +++ b/paddle/fluid/platform/dynload/rocblas.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/rocblas.h" + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag rocblas_dso_flag; +void *rocblas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +ROCBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R2 +ROCBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + +#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R3 +ROCBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + +#ifdef ROCBLAS_BLAS_ROUTINE_EACH_R4 +ROCBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h new file mode 100644 index 00000000000000..858c3ae4d59d3e --- /dev/null +++ b/paddle/fluid/platform/dynload/rocblas.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag rocblas_dso_flag; +extern void *rocblas_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cublas routine + * via operator overloading. 
+ * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using rocblas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(rocblas_dso_flag, []() { \ + rocblas_dso_handle = paddle::platform::dynload::GetRocblasDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(rocblas_saxpy); \ + __macro(rocblas_daxpy); \ + __macro(rocblas_sscal); \ + __macro(rocblas_dscal); \ + __macro(rocblas_scopy); \ + __macro(rocblas_dcopy); \ + __macro(rocblas_sgemv); \ + __macro(rocblas_dgemv); \ + __macro(rocblas_sgemm); \ + __macro(rocblas_dgemm); \ + __macro(rocblas_hgemm); \ + __macro(rocblas_dgeam); \ + /*__macro(rocblas_gemm_ex); */\ + __macro(rocblas_sgemm_batched); \ + __macro(rocblas_dgemm_batched); \ + __macro(rocblas_cgemm_batched); \ + __macro(rocblas_zgemm_batched); \ + __macro(rocblas_create_handle); \ + __macro(rocblas_destroy_handle); \ + __macro(rocblas_add_stream); \ + __macro(rocblas_set_stream); \ + __macro(rocblas_get_stream); \ + __macro(rocblas_set_pointer_mode); \ + __macro(rocblas_get_pointer_mode); + +ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + +#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ + __macro(rocblas_sgemm_strided_batched); \ + __macro(rocblas_dgemm_strided_batched); \ + __macro(rocblas_cgemm_strided_batched); \ + __macro(rocblas_zgemm_strided_batched); \ + __macro(rocblas_hgemm_strided_batched); + +ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + +#define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ + +ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + +#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + 
__macro(rocblas_gemm_batched_ex); \ +// __macro(rocblas_gemm_strided_batched_ex); + +ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) + + +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc new file mode 100644 index 00000000000000..c26e7f57ecaa2d --- /dev/null +++ b/paddle/fluid/platform/dynload/rocm_driver.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/rocm_driver.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag rocm_dso_flag; +void* rocm_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +ROCM_ROUTINE_EACH(DEFINE_WRAP); + +bool HasROCMDriver() { + std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetROCMDsoHandle(); }); + return rocm_dso_handle != nullptr; +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h new file mode 100644 index 00000000000000..7ab870b15f0157 --- /dev/null +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag rocm_dso_flag; +extern void* rocm_dso_handle; +extern bool HasROCMDriver(); + +#define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using rocm_func = decltype(&::__name); \ + std::call_once(rocm_dso_flag, []() { \ + rocm_dso_handle = paddle::platform::dynload::GetROCMDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(rocm_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +/** + * include all needed cuda driver functions + **/ +#define ROCM_ROUTINE_EACH(__macro) \ + __macro(hipGetErrorString); \ + __macro(hipModuleLoadData); \ + __macro(hipModuleGetFunction); \ + __macro(hipModuleUnload); \ + /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \ + __macro(hipModuleLaunchKernel); \ + __macro(hipLaunchKernel); \ + __macro(hipGetDevice); \ + __macro(hipDevicePrimaryCtxGetState) + +ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_ROCM_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocrand.cc b/paddle/fluid/platform/dynload/rocrand.cc new file mode 100644 index 00000000000000..8e14f88ff9c182 --- /dev/null +++ b/paddle/fluid/platform/dynload/rocrand.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/dynload/rocrand.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocrand.h b/paddle/fluid/platform/dynload/rocrand.h new file mode 100644 index 00000000000000..ebb53ede84bc8e --- /dev/null +++ b/paddle/fluid/platform/dynload/rocrand.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include // NOLINT +#include "paddle/fluid/platform/port.h" + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag curand_dso_flag; +extern void *curand_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + rocrand_status operator()(Args... 
args) { \ + using curandFunc = decltype(&::__name); \ + std::call_once(curand_dso_flag, []() { \ + curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(rocrand_create_generator); \ + __macro(rocrand_set_stream); \ + __macro(rocrand_generate_uniform); \ + __macro(rocrand_generate_uniform_double); \ + __macro(rocrand_generate_normal); \ + __macro(rocrand_destroy_generator); + +CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index b70a206b7dee62..72c50619e10d16 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -20,6 +20,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_HIP +#define CUDA_VERSION 10000 +#include +#endif #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) @@ -37,6 +41,10 @@ limitations under the License. 
*/ #define PADDLE_CUDA_FP16 #include #endif +#ifdef __HIPCC__ +#define PADDLE_CUDA_FP16 +#include +#endif #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) @@ -81,11 +89,13 @@ struct PADDLE_ALIGN(2) float16 { // Constructors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else x = h.x; #endif // CUDA_VERSION >= 9000 +#endif } #endif // PADDLE_CUDA_FP16 @@ -100,7 +110,7 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit float16(float val) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__)))) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -246,7 +256,7 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit operator float() const { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__)))) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -353,10 +363,10 @@ struct PADDLE_ALIGN(2) float16 { // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in // CUDA 9.0 regarding the half data type. 
+//xuan[TODO] change for rocm #if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 - DEVICE inline half operator+(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hadd(a, b); #else float res = static_cast(float16(a)) + static_cast(float16(b)); @@ -365,7 +375,7 @@ DEVICE inline half operator+(const half& a, const half& b) { } DEVICE inline half operator-(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hsub(a, b); #else float res = static_cast(float16(a)) - static_cast(float16(b)); @@ -374,7 +384,7 @@ DEVICE inline half operator-(const half& a, const half& b) { } DEVICE inline half operator*(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hmul(a, b); #else float res = static_cast(float16(a)) * static_cast(float16(b)); @@ -383,7 +393,7 @@ DEVICE inline half operator*(const half& a, const half& b) { } DEVICE inline half operator/(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); @@ -394,7 +404,7 @@ DEVICE inline half operator/(const half& a, const half& b) { } DEVICE inline half operator-(const half& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hneg(a); #else float res = -static_cast(float16(a)); @@ -423,7 +433,7 @@ DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT } DEVICE inline bool operator==(const half& a, const 
half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __heq(a, b); #else return static_cast(float16(a)) == static_cast(float16(b)); @@ -431,7 +441,7 @@ DEVICE inline bool operator==(const half& a, const half& b) { } DEVICE inline bool operator!=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hne(a, b); #else return static_cast(float16(a)) != static_cast(float16(b)); @@ -439,7 +449,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) { } DEVICE inline bool operator<(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hlt(a, b); #else return static_cast(float16(a)) < static_cast(float16(b)); @@ -447,7 +457,7 @@ DEVICE inline bool operator<(const half& a, const half& b) { } DEVICE inline bool operator<=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hle(a, b); #else return static_cast(float16(a)) <= static_cast(float16(b)); @@ -455,7 +465,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) { } DEVICE inline bool operator>(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hgt(a, b); #else return static_cast(float16(a)) > static_cast(float16(b)); @@ -463,7 +473,7 @@ DEVICE inline bool operator>(const half& a, const half& b) { } DEVICE inline bool operator>=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || 
(defined(__HIP_DEVICE_COMPILE__))) return __hge(a, b); #else return static_cast(float16(a)) >= static_cast(float16(b)); @@ -475,7 +485,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) { // Arithmetic operators for float16 on GPU #if defined(PADDLE_CUDA_FP16) HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return float16(__hadd(half(a), half(b))); #else return float16(static_cast(a) + static_cast(b)); @@ -483,7 +493,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return float16(__hsub(half(a), half(b))); #else return float16(static_cast(a) - static_cast(b)); @@ -491,7 +501,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return float16(__hmul(half(a), half(b))); #else return float16(static_cast(a) * static_cast(b)); @@ -499,7 +509,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__))) // TODO(kexinzhao): check which cuda version starts to support __hdiv float num = __half2float(half(a)); float denom = __half2float(half(b)); @@ -510,7 +520,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { } HOSTDEVICE inline float16 
operator-(const float16& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return float16(__hneg(half(a))); #else float16 res; @@ -540,7 +550,7 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT } HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __heq(half(a), half(b)); #else return static_cast(a) == static_cast(b); @@ -548,7 +558,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hne(half(a), half(b)); #else return static_cast(a) != static_cast(b); @@ -556,7 +566,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hlt(half(a), half(b)); #else return static_cast(a) < static_cast(b); @@ -564,7 +574,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hle(half(a), half(b)); #else return static_cast(a) <= static_cast(b); @@ -572,7 +582,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && 
__CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hgt(half(a), half(b)); #else return static_cast(a) > static_cast(b); @@ -580,7 +590,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) return __hge(half(a), half(b)); #else return static_cast(a) >= static_cast(b); @@ -846,7 +856,7 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { } HOSTDEVICE inline bool(isnan)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__)))) return __hisnan(half(a)); #else return (a.x & 0x7fff) > 0x7c00; @@ -936,31 +946,31 @@ struct numeric_limits { static const bool traps = true; static const bool tinyness_before = false; - static paddle::platform::float16(min)() { + HOSTDEVICE static paddle::platform::float16(min)() { return paddle::platform::raw_uint16_to_float16(0x400); } - static paddle::platform::float16 lowest() { + HOSTDEVICE static paddle::platform::float16 lowest() { return paddle::platform::raw_uint16_to_float16(0xfbff); } - static paddle::platform::float16(max)() { + HOSTDEVICE static paddle::platform::float16(max)() { return paddle::platform::raw_uint16_to_float16(0x7bff); } - static paddle::platform::float16 epsilon() { + HOSTDEVICE static paddle::platform::float16 epsilon() { return paddle::platform::raw_uint16_to_float16(0x0800); } - static paddle::platform::float16 round_error() { + HOSTDEVICE static paddle::platform::float16 round_error() { return paddle::platform::float16(0.5); } - static paddle::platform::float16 infinity() { + HOSTDEVICE static paddle::platform::float16 
infinity() { return paddle::platform::raw_uint16_to_float16(0x7c00); } - static paddle::platform::float16 quiet_NaN() { + HOSTDEVICE static paddle::platform::float16 quiet_NaN() { return paddle::platform::raw_uint16_to_float16(0x7e00); } - static paddle::platform::float16 signaling_NaN() { + HOSTDEVICE static paddle::platform::float16 signaling_NaN() { return paddle::platform::raw_uint16_to_float16(0x7e00); } - static paddle::platform::float16 denorm_min() { + HOSTDEVICE static paddle::platform::float16 denorm_min() { return paddle::platform::raw_uint16_to_float16(0x1); } }; diff --git a/paddle/fluid/platform/hostdevice.h b/paddle/fluid/platform/hostdevice.h index c0dc92a5217640..1ffbbc217e254c 100644 --- a/paddle/fluid/platform/hostdevice.h +++ b/paddle/fluid/platform/hostdevice.h @@ -13,7 +13,11 @@ // limitations under the License. #pragma once -#ifdef __CUDACC__ +#ifdef __HIPCC__ +#include +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index c25b692a4a0c70..bc1ab96528cc73 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -145,11 +145,11 @@ if(WITH_PYTHON) endif(WITH_MKLDNN) endif(WIN32) - if(WITH_AMD_GPU) - hip_library(paddle_pybind SHARED + if(WITH_ROCM_PLATFORM) + cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ARCHIVE_START ${PYBIND_DEPS} - ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END) + DEPS ${PYBIND_DEPS} + ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) else() cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} @@ -158,7 +158,7 @@ if(WITH_PYTHON) if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) endif(NOT APPLE AND NOT WIN32) - endif(WITH_AMD_GPU) + endif(WITH_ROCM_PLATFORM) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) 
From c7f4a294059ace125404a42b2162167212f62e73 Mon Sep 17 00:00:00 2001 From: XUANBABY Date: Thu, 3 Dec 2020 17:57:13 +0800 Subject: [PATCH 02/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/external/pybind11.cmake | 1 - cmake/generic.cmake | 2 +- cmake/hip.cmake | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 117c6cde11e65d..69bd68c2778497 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -39,7 +39,6 @@ ExternalProject_Add( # to be modified without triggering incremental compilation, and the # third-party library version changes cannot be incorporated. # reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html -# UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 0cd41043fd5e95..be7adf4dceaeaa 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -267,7 +267,7 @@ function(merge_static_libs TARGET_NAME) endfunction(merge_static_libs) function(check_coverage_opt TARGET_NAME SRCS) - if(WITH_COVERAGE) + if(WITH_COVERAGE AND WITH_INCREMENTAL_COVERAGE) if ("$ENV{PADDLE_GIT_DIFF_H_FILE}" STREQUAL "") if (NOT ("$ENV{PADDLE_GIT_DIFF_CC_FILE}" STREQUAL "")) string(REPLACE "," ";" CC_FILE_LIST $ENV{PADDLE_GIT_DIFF_CC_FILE}) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 4d1a074e1abb9c..ac666ec686d163 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -11,7 +11,6 @@ include_directories("${ROCM_PATH}/hiprand/include") include_directories("${ROCM_PATH}/rocrand/include") include_directories("${ROCM_PATH}/rccl/include") -#include_directories("${ROCM_PATH}/thrust") include_directories("${ROCM_PATH}/rocthrust/include/") include_directories("${ROCM_PATH}/hipcub/include/") 
include_directories("${ROCM_PATH}/rocprim/include/") From 9ecc54cf7d16c64e64dba5635d795e250cc78774 Mon Sep 17 00:00:00 2001 From: XUANBABY Date: Fri, 4 Dec 2020 09:59:43 +0800 Subject: [PATCH 03/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=80=E4=BA=9B?= =?UTF-8?q?=E6=AD=A7=E4=B9=89=E5=B9=B6=E6=B7=BB=E5=8A=A0=E5=A4=87=E6=B3=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fluid/platform/dynload/dynamic_loader.cc | 18 +++++++++--------- paddle/fluid/platform/dynload/rocblas.h | 2 ++ paddle/fluid/platform/dynload/rocm_driver.h | 1 + 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index dee7c001da8038..137e2ab0f0c2ea 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -49,19 +49,19 @@ DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); #ifdef PADDLE_WITH_HIP DEFINE_string(miopen_dir, "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); + "Specify path for loading libMIOpen.so. For instance, " + "/opt/rocm/miopen/lib. If empty [default], dlopen " + "will search miopen from LD_LIBRARY_PATH"); DEFINE_string(rocm_dir, "", - "Specify path for loading cuda library, such as libcublas, " - "libcurand, libcusolver. For instance, /usr/local/cuda/lib64. " - "If default, dlopen will search cuda from LD_LIBRARY_PATH"); + "Specify path for loading rocm library, such as librocblas, " + "libcurand, libcusolver. For instance, /opt/rocm/lib. " + "If default, dlopen will search rocm from LD_LIBRARY_PATH"); DEFINE_string(rccl_dir, "", - "Specify path for loading nccl library, such as libnccl.so. " - "For instance, /usr/local/cuda/lib64. 
If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); + "Specify path for loading rccl library, such as librccl.so. " + "For instance, /opt/rocm/rccl/lib. If default, " + "dlopen will search rccl from LD_LIBRARY_PATH"); #endif namespace paddle { diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h index 858c3ae4d59d3e..c705396e90db96 100644 --- a/paddle/fluid/platform/dynload/rocblas.h +++ b/paddle/fluid/platform/dynload/rocblas.h @@ -64,6 +64,7 @@ extern void *rocblas_dso_handle; __macro(rocblas_dgemm); \ __macro(rocblas_hgemm); \ __macro(rocblas_dgeam); \ + /*rocblas_gemm_ex function not support at rocm3.5*/ \ /*__macro(rocblas_gemm_ex); */\ __macro(rocblas_sgemm_batched); \ __macro(rocblas_dgemm_batched); \ @@ -94,6 +95,7 @@ ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ __macro(rocblas_gemm_batched_ex); \ +//rocm not support now(rocm3.5) // __macro(rocblas_gemm_strided_batched_ex); ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 7ab870b15f0157..37e1c2f0d61793 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -50,6 +50,7 @@ extern bool HasROCMDriver(); __macro(hipModuleLoadData); \ __macro(hipModuleGetFunction); \ __macro(hipModuleUnload); \ + /*rocm3.5 not support the function*/ \ /* __macro(hipOccupancyMaxActiveBlocksPerMultiprocessor);*/ \ __macro(hipModuleLaunchKernel); \ __macro(hipLaunchKernel); \ From b3e4c16286e99093f41eb19d2d35be0984f427c2 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Mon, 7 Dec 2020 12:35:58 +0800 Subject: [PATCH 04/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/platform/float16.h | 80 
++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 72c50619e10d16..753f0d398c2044 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -110,7 +110,9 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit float16(float val) { -#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__)))) +#if ((defined(PADDLE_CUDA_FP16)) && \ + ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \ + (defined(__HIP_DEVICE_COMPILE__)))) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -256,7 +258,9 @@ struct PADDLE_ALIGN(2) float16 { #endif HOSTDEVICE inline explicit operator float() const { -#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__)))) +#if (defined(PADDLE_CUDA_FP16) && \ + ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \ + (defined(__HIP_DEVICE_COMPILE__)))) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -363,10 +367,11 @@ struct PADDLE_ALIGN(2) float16 { // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in // CUDA 9.0 regarding the half data type. 
-//xuan[TODO] change for rocm +// xuan[TODO] change for rocm #if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 DEVICE inline half operator+(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hadd(a, b); #else float res = static_cast(float16(a)) + static_cast(float16(b)); @@ -375,7 +380,8 @@ DEVICE inline half operator+(const half& a, const half& b) { } DEVICE inline half operator-(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hsub(a, b); #else float res = static_cast(float16(a)) - static_cast(float16(b)); @@ -384,7 +390,8 @@ DEVICE inline half operator-(const half& a, const half& b) { } DEVICE inline half operator*(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hmul(a, b); #else float res = static_cast(float16(a)) * static_cast(float16(b)); @@ -393,7 +400,8 @@ DEVICE inline half operator*(const half& a, const half& b) { } DEVICE inline half operator/(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); @@ -404,7 +412,8 @@ DEVICE inline half operator/(const half& a, const half& b) { } DEVICE inline half operator-(const half& a) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ 
>= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hneg(a); #else float res = -static_cast(float16(a)); @@ -433,7 +442,8 @@ DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT } DEVICE inline bool operator==(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __heq(a, b); #else return static_cast(float16(a)) == static_cast(float16(b)); @@ -441,7 +451,8 @@ DEVICE inline bool operator==(const half& a, const half& b) { } DEVICE inline bool operator!=(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hne(a, b); #else return static_cast(float16(a)) != static_cast(float16(b)); @@ -449,7 +460,8 @@ DEVICE inline bool operator!=(const half& a, const half& b) { } DEVICE inline bool operator<(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hlt(a, b); #else return static_cast(float16(a)) < static_cast(float16(b)); @@ -457,7 +469,8 @@ DEVICE inline bool operator<(const half& a, const half& b) { } DEVICE inline bool operator<=(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hle(a, b); #else return static_cast(float16(a)) <= static_cast(float16(b)); @@ -465,7 +478,8 @@ DEVICE inline bool operator<=(const half& a, const half& b) { } DEVICE inline bool operator>(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) 
|| (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hgt(a, b); #else return static_cast(float16(a)) > static_cast(float16(b)); @@ -473,7 +487,8 @@ DEVICE inline bool operator>(const half& a, const half& b) { } DEVICE inline bool operator>=(const half& a, const half& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hge(a, b); #else return static_cast(float16(a)) >= static_cast(float16(b)); @@ -485,7 +500,8 @@ DEVICE inline bool operator>=(const half& a, const half& b) { // Arithmetic operators for float16 on GPU #if defined(PADDLE_CUDA_FP16) HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return float16(__hadd(half(a), half(b))); #else return float16(static_cast(a) + static_cast(b)); @@ -493,7 +509,8 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return float16(__hsub(half(a), half(b))); #else return float16(static_cast(a) - static_cast(b)); @@ -501,7 +518,8 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return 
float16(__hmul(half(a), half(b))); #else return float16(static_cast(a) * static_cast(b)); @@ -509,7 +527,8 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \ + (defined(__HIP_DEVICE_COMPILE__))) // TODO(kexinzhao): check which cuda version starts to support __hdiv float num = __half2float(half(a)); float denom = __half2float(half(b)); @@ -520,7 +539,8 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { } HOSTDEVICE inline float16 operator-(const float16& a) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return float16(__hneg(half(a))); #else float16 res; @@ -550,7 +570,8 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT } HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __heq(half(a), half(b)); #else return static_cast(a) == static_cast(b); @@ -558,7 +579,8 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hne(half(a), half(b)); #else return static_cast(a) != static_cast(b); @@ -566,7 +588,8 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator<(const 
float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hlt(half(a), half(b)); #else return static_cast(a) < static_cast(b); @@ -574,7 +597,8 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hle(half(a), half(b)); #else return static_cast(a) <= static_cast(b); @@ -582,7 +606,8 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hgt(half(a), half(b)); #else return static_cast(a) > static_cast(b); @@ -590,7 +615,8 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { } HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { -#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__))) +#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__))) return __hge(half(a), half(b)); #else return static_cast(a) >= static_cast(b); @@ -856,7 +882,9 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { } HOSTDEVICE inline bool(isnan)(const float16& a) { -#if (defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || (defined(__HIP_DEVICE_COMPILE__)))) +#if (defined(PADDLE_CUDA_FP16) && \ + ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + (defined(__HIP_DEVICE_COMPILE__)))) return 
__hisnan(half(a)); #else return (a.x & 0x7fff) > 0x7c00; From ce2b25304592b7188de71e33412680fc3cbf21f1 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Mon, 7 Dec 2020 19:20:48 +0800 Subject: [PATCH 05/10] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=86=B2=E7=AA=81?= =?UTF-8?q?=E5=90=8E=E7=9A=84=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/operators.cmake | 87 +++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 764d96e189cbd1..0d9f6c648b3b75 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -1,10 +1,10 @@ -# CMake file `unity_build` is used to handle Unity Build compilation. +#CMake file `unity_build` is used to handle Unity Build compilation. include(unity_build) set(PART_CUDA_KERNEL_FILES) function(op_library TARGET) - # op_library is a function to create op library. The interface is same as - # cc_library. But it handle split GPU/CPU code and link some common library - # for ops. +#op_library is a function to create op library.The interface is same as +#cc_library.But it handle split GPU / CPU code and link some common library +#for ops. set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) @@ -19,7 +19,8 @@ function(op_library TARGET) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) - # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. +#Option `UNITY` is used to specify that operator `TARGET` will compiles with \ + Unity Build. set(options UNITY) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) @@ -121,7 +122,8 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() if (WIN32) - # remove windows unsupported op, because windows has no nccl, no warpctc such ops. 
+#remove windows unsupported op, because windows has no nccl, \ + no warpctc such ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() @@ -129,9 +131,10 @@ function(op_library TARGET) endforeach() endif(WIN32) - # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. +#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ + option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) - # Generate the unity target name by the directory where source files located. +#Generate the unity target name by the directory where source files located. string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR}) string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") @@ -147,40 +150,42 @@ function(op_library TARGET) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) endif() if (WITH_GPU) - # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. +#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ + option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) - # Combine the cc and cu source files. +#Combine the cc and cu source files. compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs}) compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs} ${cu_srcs}) if(TARGET ${UNITY_TARGET}) - # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. +#If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources} ${unity_target_cu_sources}) else() - # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. +#If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. 
nv_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} ${unity_target_cu_sources} DEPS ${op_library_DEPS} ${op_common_deps}) endif() - # Add alias library to handle dependencies. +#Add alias library to handle dependencies. add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() - elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + elseif (WITH_ROCM_PLATFORM) + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() - # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. +#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ + option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) - # Combine the cc source files. +#Combine the cc source files. compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}) if(TARGET ${UNITY_TARGET}) - # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. +#If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) else() - # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. +#If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. cc_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} DEPS ${op_library_DEPS} ${op_common_deps}) endif() - # Add alias library to handle dependencies. +#Add alias library to handle dependencies. 
add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} @@ -188,7 +193,7 @@ function(op_library TARGET) endif() endif() - # Define operators that don't need pybind here. +#Define operators that don't need pybind here. foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" @@ -200,12 +205,14 @@ function(op_library TARGET) endif() endforeach() - # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. - # Note that it's enough to just adding one operator to pybind in a *_op.cc file. - # And for detail pybind information, please see generated paddle/pybind/pybind.h. +#The registration of USE_OP, \ + please refer to paddle / fluid / framework / op_registry.h. +#Note that it's enough to just adding one operator to pybind in a *_op.cc file. +#And for detail pybind information, \ + please see generated paddle / pybind / pybind.h. file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") - # [ \t\r\n]* is used for blank characters +#[ \t\r\n] * is used for blank characters string(REGEX MATCH "REGISTER_OPERATOR\\([ \t\r\n]*[a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") @@ -213,13 +220,13 @@ function(op_library TARGET) else () string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") string(REPLACE "," "" TARGET "${TARGET}") - # [ \t\r\n]+ is used for blank characters. - # Here we use '+' instead of '*' since it is a REPLACE operation. +#[ \t\r\n] + is used for blank characters. +#Here we use '+' instead of '*' since it is a REPLACE operation. 
string(REGEX REPLACE "[ \t\r\n]+" "" TARGET "${TARGET}") endif() - # pybind USE_NO_KERNEL_OP - # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel +#pybind USE_NO_KERNEL_OP +#HACK : if REGISTER_OP_CPU_KERNEL presents the operator must have kernel string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") string(REPLACE "_op" "" TARGET "${TARGET}") if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") @@ -227,7 +234,7 @@ function(op_library TARGET) set(pybind_flag 1) endif() - # pybind USE_CPU_ONLY_OP +#pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) @@ -240,7 +247,7 @@ function(op_library TARGET) set(pybind_flag 1) endif() - # pybind USE_OP_DEVICE_KERNEL for CUDNN +#pybind USE_OP_DEVICE_KERNEL for CUDNN list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) if(${TARGET} STREQUAL "activation") @@ -250,13 +257,13 @@ function(op_library TARGET) endif() endif() - # pybind USE_OP_DEVICE_KERNEL for CUDNN +#pybind USE_OP_DEVICE_KERNEL for CUDNN list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) if (WITH_GPU AND ${cudnn_cu_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() - # pybind USE_OP_DEVICE_KERNEL for MIOPEN +#pybind USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0) if(${TARGET} STREQUAL "activation") @@ -265,8 +272,8 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() endif() - - # pybind USE_OP_DEVICE_KERNEL for MIOPEN + +#pybind USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_srcs miopen_hip_cu_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") @@ -276,9 +283,9 @@ 
function(op_library TARGET) if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() - # pybind USE_OP_DEVICE_KERNEL for MKLDNN +#pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) - # Append first implemented MKLDNN activation operator +#Append first implemented MKLDNN activation operator if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") @@ -298,9 +305,9 @@ function(op_library TARGET) endif() endif() - # pybind USE_OP +#pybind USE_OP if (${pybind_flag} EQUAL 0) - # NOTE(*): activation use macro to regist the kernels, set use_op manually. +#NOTE(*) : activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") @@ -341,7 +348,7 @@ function(register_operators) endif() endforeach() - # Complete the processing of `UNITY_TARGET`. +#Complete the processing of `UNITY_TARGET`. if(WITH_UNITY_BUILD) finish_unity_target(cc) if(WITH_GPU) From 7add2748e22953aab02ef3cceeb098ab088f4309 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Tue, 8 Dec 2020 09:52:27 +0800 Subject: [PATCH 06/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9operators.cmake?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/operators.cmake | 93 ++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 55 deletions(-) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 0d9f6c648b3b75..92c99a2df5b5f8 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -1,26 +1,23 @@ -#CMake file `unity_build` is used to handle Unity Build compilation. +# CMake file `unity_build` is used to handle Unity Build compilation. 
include(unity_build) set(PART_CUDA_KERNEL_FILES) function(op_library TARGET) -#op_library is a function to create op library.The interface is same as -#cc_library.But it handle split GPU / CPU code and link some common library -#for ops. + # op_library is a function to create op library. The interface is same as + # cc_library. But it handle split GPU/CPU code and link some common library + # for ops. set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) - set(hip_cu_cc_srcs) + set(miopen_hip_cc_srcs) set(cu_cc_srcs) set(xpu_cc_srcs) set(cudnn_cu_cc_srcs) set(cudnn_cu_srcs) - set(miopen_hip_cu_cc_srcs) - set(miopen_hip_cu_srcs) set(CUDNN_FILE) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) -#Option `UNITY` is used to specify that operator `TARGET` will compiles with \ - Unity Build. + # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. set(options UNITY) set(oneValueArgs "") set(multiValueArgs SRCS DEPS) @@ -122,8 +119,7 @@ function(op_library TARGET) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() if (WIN32) -#remove windows unsupported op, because windows has no nccl, \ - no warpctc such ops. + # remove windows unsupported op, because windows has no nccl, no warpctc such ops. foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() @@ -131,10 +127,9 @@ function(op_library TARGET) endforeach() endif(WIN32) -#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ - option `UNITY`. + # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) -#Generate the unity target name by the directory where source files located. + # Generate the unity target name by the directory where source files located. 
string(REPLACE "${PADDLE_SOURCE_DIR}/paddle/fluid/" "" UNITY_TARGET ${CMAKE_CURRENT_SOURCE_DIR}) string(REPLACE "/" "_" UNITY_TARGET ${UNITY_TARGET}) set(UNITY_TARGET "paddle_${UNITY_TARGET}_unity") @@ -150,42 +145,40 @@ function(op_library TARGET) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) endif() if (WITH_GPU) -#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ - option `UNITY`. + # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) -#Combine the cc and cu source files. + # Combine the cc and cu source files. compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs}) compose_unity_target_sources(${UNITY_TARGET} cu ${cudnn_cu_srcs} ${cu_srcs}) if(TARGET ${UNITY_TARGET}) -#If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. + # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources} ${unity_target_cu_sources}) else() -#If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. + # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. nv_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} ${unity_target_cu_sources} DEPS ${op_library_DEPS} ${op_common_deps}) endif() -#Add alias library to handle dependencies. + # Add alias library to handle dependencies. 
add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cudnn_cu_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() elseif (WITH_ROCM_PLATFORM) - hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS {op_library_DEPS} ${op_common_deps}) else() -#Unity Build relies on global option `WITH_UNITY_BUILD` and local \ - option `UNITY`. + # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) -#Combine the cc source files. + # Combine the cc source files. compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}) if(TARGET ${UNITY_TARGET}) -#If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. + # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) else() -#If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. + # If `UNITY_TARGET` does not exist, create `UNITY_TARGET` with source files. cc_library(${UNITY_TARGET} SRCS ${unity_target_cc_sources} DEPS ${op_library_DEPS} ${op_common_deps}) endif() -#Add alias library to handle dependencies. + # Add alias library to handle dependencies. add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} @@ -193,7 +186,7 @@ function(op_library TARGET) endif() endif() -#Define operators that don't need pybind here. + # Define operators that don't need pybind here. 
foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" @@ -205,14 +198,12 @@ function(op_library TARGET) endif() endforeach() -#The registration of USE_OP, \ - please refer to paddle / fluid / framework / op_registry.h. -#Note that it's enough to just adding one operator to pybind in a *_op.cc file. -#And for detail pybind information, \ - please see generated paddle / pybind / pybind.h. + # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") -#[ \t\r\n] * is used for blank characters + # [ \t\r\n]* is used for blank characters string(REGEX MATCH "REGISTER_OPERATOR\\([ \t\r\n]*[a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") @@ -220,13 +211,13 @@ function(op_library TARGET) else () string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") string(REPLACE "," "" TARGET "${TARGET}") -#[ \t\r\n] + is used for blank characters. -#Here we use '+' instead of '*' since it is a REPLACE operation. + # [ \t\r\n]+ is used for blank characters. + # Here we use '+' instead of '*' since it is a REPLACE operation. 
string(REGEX REPLACE "[ \t\r\n]+" "" TARGET "${TARGET}") endif() -#pybind USE_NO_KERNEL_OP -#HACK : if REGISTER_OP_CPU_KERNEL presents the operator must have kernel + # pybind USE_NO_KERNEL_OP + # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") string(REPLACE "_op" "" TARGET "${TARGET}") if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") @@ -234,7 +225,7 @@ function(op_library TARGET) set(pybind_flag 1) endif() -#pybind USE_CPU_ONLY_OP + # pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) @@ -247,7 +238,7 @@ function(op_library TARGET) set(pybind_flag 1) endif() -#pybind USE_OP_DEVICE_KERNEL for CUDNN + # pybind USE_OP_DEVICE_KERNEL for CUDNN list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) if(${TARGET} STREQUAL "activation") @@ -256,14 +247,7 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() endif() - -#pybind USE_OP_DEVICE_KERNEL for CUDNN - list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) - if (WITH_GPU AND ${cudnn_cu_srcs_len} GREATER 0) - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") - endif() - -#pybind USE_OP_DEVICE_KERNEL for MIOPEN + # pybind USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0) if(${TARGET} STREQUAL "activation") @@ -273,19 +257,18 @@ function(op_library TARGET) endif() endif() -#pybind USE_OP_DEVICE_KERNEL for MIOPEN + # pybind USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_srcs miopen_hip_cu_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() - if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) 
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() -#pybind USE_OP_DEVICE_KERNEL for MKLDNN + # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) -#Append first implemented MKLDNN activation operator + # Append first implemented MKLDNN activation operator if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") @@ -305,9 +288,9 @@ function(op_library TARGET) endif() endif() -#pybind USE_OP + # pybind USE_OP if (${pybind_flag} EQUAL 0) -#NOTE(*) : activation use macro to regist the kernels, set use_op manually. + # NOTE(*): activation use macro to regist the kernels, set use_op manually. if(${TARGET} STREQUAL "activation") file(APPEND ${pybind_file} "USE_OP(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") @@ -348,7 +331,7 @@ function(register_operators) endif() endforeach() -#Complete the processing of `UNITY_TARGET`. + # Complete the processing of `UNITY_TARGET`. if(WITH_UNITY_BUILD) finish_unity_target(cc) if(WITH_GPU) From 02af637060031b641fd52d022291ccb355654d75 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Tue, 8 Dec 2020 11:00:35 +0800 Subject: [PATCH 07/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/platform/dynload/hiprtc.h | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h index 59724045e4bd6d..b29bf6274e31b4 100644 --- a/paddle/fluid/platform/dynload/hiprtc.h +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -27,21 +27,20 @@ extern std::once_flag hiprtc_dso_flag; extern void* hiprtc_dso_handle; extern bool HasHIPRTC(); -#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... 
args) -> DECLARE_TYPE(__name, args...) { \ - using hiprtc_func = decltype(&::__name); \ - std::call_once(hiprtc_dso_flag, []() { \ +#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using hiprtc_func = decltype(&::__name); \ + std::call_once(hiprtc_dso_flag, []() { \ hiprtc_dso_handle = paddle::platform::dynload::GetHIPRTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ + }); \ + static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern struct DynLoad__##__name __name - /** * include all needed hiprtc functions **/ @@ -50,7 +49,7 @@ extern bool HasHIPRTC(); __macro(hiprtcCompileProgram); \ __macro(hiprtcCreateProgram); \ __macro(hiprtcDestroyProgram); \ - __macro(hiprtcGetCode); \ + __macro(hiprtcGetCode); \ __macro(hiprtcGetCodeSize); \ __macro(hiprtcGetProgramLog); \ __macro(hiprtcGetProgramLogSize) From 9c7ebc9ba2622a2c1cd45d43ee96b3d4ff3fc704 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Tue, 8 Dec 2020 13:45:39 +0800 Subject: [PATCH 08/10] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/operators.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 92c99a2df5b5f8..c69caa597239a0 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -247,6 +247,13 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() endif() + + # pybind USE_OP_DEVICE_KERNEL for CUDNN + list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) + if (WITH_GPU AND ${cudnn_cu_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") + endif() + # pybind 
USE_OP_DEVICE_KERNEL for MIOPEN list(LENGTH miopen_hip_cu_cc_srcs miopen_hip_cu_cc_srcs_len) if (WITH_ROCM_PLATFORM AND ${miopen_hip_cu_cc_srcs_len} GREATER 0) From 02346a82e3361554bd1dc3f8fc21b377c405923f Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Thu, 10 Dec 2020 14:20:23 +0800 Subject: [PATCH 09/10] =?UTF-8?q?=E7=BB=9F=E4=B8=80=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/operators.cmake | 2 +- .../fluid/platform/dynload/dynamic_loader.cc | 47 ++++------------ .../fluid/platform/dynload/dynamic_loader.h | 8 --- paddle/fluid/platform/dynload/hiprand.h | 24 ++++---- paddle/fluid/platform/dynload/hiprtc.cc | 4 +- paddle/fluid/platform/dynload/hiprtc.h | 28 +++++----- paddle/fluid/platform/dynload/miopen.cc | 4 +- paddle/fluid/platform/dynload/miopen.h | 44 +++++++-------- paddle/fluid/platform/dynload/rccl.h | 2 +- paddle/fluid/platform/dynload/rocblas.h | 26 ++++----- paddle/fluid/platform/dynload/rocm_driver.cc | 4 +- paddle/fluid/platform/dynload/rocm_driver.h | 4 +- paddle/fluid/platform/dynload/rocrand.cc | 30 ---------- paddle/fluid/platform/dynload/rocrand.h | 55 ------------------- 14 files changed, 83 insertions(+), 199 deletions(-) delete mode 100644 paddle/fluid/platform/dynload/rocrand.cc delete mode 100644 paddle/fluid/platform/dynload/rocrand.h diff --git a/cmake/operators.cmake b/cmake/operators.cmake index c69caa597239a0..824daf77519afe 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -164,7 +164,7 @@ function(op_library TARGET) ${op_common_deps}) endif() elseif (WITH_ROCM_PLATFORM) - hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS {op_library_DEPS} + hip_library_ops(${TARGET} SRCS ${cc_srcs} ${hip_cu_cc_srcs} ${hip_cu_srcs} ${miopen_hip_cu_cc_srcs} ${miopen_hip_cu_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() # Unity Build relies on 
global option `WITH_UNITY_BUILD` and local option `UNITY`. diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 5a719862d24c83..303322a710a7d2 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -264,17 +264,13 @@ void* GetCublasDsoHandle() { #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif } -#ifdef PADDLE_WITH_HIP -void* GetRocblasDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); -} -#endif - void* GetCUDNNDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) std::string mac_warn_meg( @@ -296,18 +292,14 @@ void* GetCUDNNDsoHandle() { "CUDNN version."); return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path}); #endif } -#ifdef PADDLE_WITH_HIP -void* GetMIOPENDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); -} -#endif - void* GetCUPTIDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.dylib", false, @@ -324,17 +316,13 @@ void* GetCurandDsoHandle() { #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif } -#ifdef PADDLE_WITH_HIP -void* 
GetRocrandDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); -} -#endif - void* GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); @@ -349,31 +337,23 @@ void* GetCusolverDsoHandle() { void* GetNVRTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); #endif } -#ifdef PADDLE_WITH_HIP -void* GetHIPRTCDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so"); -} -#endif - void* GetCUDADsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); +#elif PADDLE_WITH_HIP + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false); #endif } -#ifdef PADDLE_WITH_HIP -void* GetROCMDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so"); -} -#endif - void* GetWarpCTCDsoHandle() { std::string warpctc_dir = ""; if (!s_py_site_pkg_path.path.empty()) { @@ -396,16 +376,13 @@ void* GetNCCLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg); +#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) + return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); #else return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); #endif } -#if defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) -void* GetRCCLDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); -} -#endif void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) diff 
--git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 2211f7c034c03d..1136184ce1fc9a 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -38,14 +38,6 @@ void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); -#ifdef PADDLE_WITH_HIP -void* GetRocblasDsoHandle(); -void* GetMIOPENDsoHandle(); -void* GetHiprandDsoHandle(); -void* GetHIPRTCDsoHandle(); -void* GetROCMDsoHandle(); -void* GetRCCLDsoHandle(); -#endif void SetPaddleLibPath(const std::string&); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/hiprand.h b/paddle/fluid/platform/dynload/hiprand.h index 298787a0436d0c..496e70bb26db68 100644 --- a/paddle/fluid/platform/dynload/hiprand.h +++ b/paddle/fluid/platform/dynload/hiprand.h @@ -26,18 +26,18 @@ namespace dynload { extern std::once_flag hiprand_dso_flag; extern void *hiprand_dso_handle; -#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - hiprandStatus_t operator()(Args... args) { \ - using hiprandFunc = decltype(&::__name); \ - std::call_once(hiprand_dso_flag, []() { \ - hiprand_dso_handle = paddle::platform::dynload::GetHiprandDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + hiprandStatus_t operator()(Args... 
args) { \ + using hiprandFunc = decltype(&::__name); \ + std::call_once(hiprand_dso_flag, []() { \ + hiprand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(hiprand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #define HIPRAND_RAND_ROUTINE_EACH(__macro) \ diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc index 0aa1f270879314..537597aaafd4b4 100644 --- a/paddle/fluid/platform/dynload/hiprtc.cc +++ b/paddle/fluid/platform/dynload/hiprtc.cc @@ -25,9 +25,9 @@ void* hiprtc_dso_handle = nullptr; HIPRTC_ROUTINE_EACH(DEFINE_WRAP); -bool HasHIPRTC() { +bool HasNVRTC() { std::call_once(hiprtc_dso_flag, - []() { hiprtc_dso_handle = GetHIPRTCDsoHandle(); }); + []() { hiprtc_dso_handle = GetNVRTCDsoHandle(); }); return hiprtc_dso_handle != nullptr; } diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h index b29bf6274e31b4..7cc58489fad9c5 100644 --- a/paddle/fluid/platform/dynload/hiprtc.h +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -25,20 +25,20 @@ namespace dynload { extern std::once_flag hiprtc_dso_flag; extern void* hiprtc_dso_handle; -extern bool HasHIPRTC(); - -#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using hiprtc_func = decltype(&::__name); \ - std::call_once(hiprtc_dso_flag, []() { \ - hiprtc_dso_handle = paddle::platform::dynload::GetHIPRTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +extern bool HasNVRTC(); + +#define DECLARE_DYNAMIC_LOAD_HIPRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using hiprtc_func = decltype(&::__name); \ + std::call_once(hiprtc_dso_flag, []() { \ + hiprtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(hiprtc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern struct DynLoad__##__name __name /** diff --git a/paddle/fluid/platform/dynload/miopen.cc b/paddle/fluid/platform/dynload/miopen.cc index 00a1daf34fb89e..1b4bdd2939feb9 100644 --- a/paddle/fluid/platform/dynload/miopen.cc +++ b/paddle/fluid/platform/dynload/miopen.cc @@ -50,13 +50,13 @@ MIOPEN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); #endif -bool HasMIOpen() { +bool HasCUDNN() { std::call_once(miopen_dso_flag, []() { miopen_dso_handle = GetCUDNNDsoHandle(); }); return miopen_dso_handle != nullptr; } -void EnforceMIOPENLoaded(const char* fn_name) { +void EnforceCUDNNLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( miopen_dso_handle, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index e4780c570b71b1..2de6429805c131 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -26,7 +26,7 @@ namespace dynload { extern std::once_flag miopen_dso_flag; extern void* miopen_dso_handle; -extern bool HasMIOpen(); +extern bool HasCUDNN(); inline const char* miopenGetErrorString(miopenStatus_t status) { switch (status) { @@ -51,19 +51,19 @@ inline const char* miopenGetErrorString(miopenStatus_t status) { } extern void EnforceCUDNNLoaded(const char* fn_name); -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using miopen_func = decltype(&::__name); \ - std::call_once(miopen_dso_flag, []() { \ - miopen_dso_handle = paddle::platform::dynload::GetMIOPENDsoHandle(); \ - }); \ - EnforceCUDNNLoaded(#__name); \ - static void* p_##__name = dlsym(miopen_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using miopen_func = decltype(&::__name); \ + std::call_once(miopen_dso_flag, []() { \ + miopen_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \ + }); \ + EnforceCUDNNLoaded(#__name); \ + static void* p_##__name = dlsym(miopen_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern struct DynLoad__##__name __name /** @@ -123,23 +123,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(miopenDestroyDropoutDescriptor); \ __macro(miopenDestroyRNNDescriptor); -MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) #define MIOPEN_DNN_ROUTINE_EACH_R2(__macro) \ __macro(miopenConvolutionBackwardData); -MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs available after R3: #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ __macro(miopenConvolutionBackwardWeightsGetWorkSpaceSize); -MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs available after R4: #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ __macro(miopenBatchNormalizationForwardTraining); \ __macro(miopenBatchNormalizationForwardInference); \ __macro(miopenBatchNormalizationBackward); -MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs in 
R5 #define MIOPEN_DNN_ROUTINE_EACH_R5(__macro) \ @@ -147,12 +147,12 @@ MIOPEN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(miopenSetActivationDescriptor); \ __macro(miopenGetActivationDescriptor); \ __macro(miopenDestroyActivationDescriptor); -MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) // APIs in R6 #define MIOPEN_DNN_ROUTINE_EACH_R6(__macro) \ /*__macro(miopenSetRNNDescriptor_v6);*/ -MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) #define MIOPEN_DNN_ROUTINE_EACH_R7(__macro) \ __macro(miopenSetConvolutionGroupCount); \ @@ -162,7 +162,7 @@ MIOPEN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) __macro(miopenSetCTCLossDescriptor); \ __macro(miopenGetCTCLossWorkspaceSize); \ __macro(miopenCTCLoss); -MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) #define MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ /*__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ @@ -170,7 +170,7 @@ __macro(cudnnBatchNormalizationForwardTrainingEx); \ __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ __macro(cudnnBatchNormalizationBackwardEx); \ __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);*/ -MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +MIOPEN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h index ac8d111e8d26dd..1d61e330c248ff 100644 --- a/paddle/fluid/platform/dynload/rccl.h +++ b/paddle/fluid/platform/dynload/rccl.h @@ -32,7 +32,7 @@ extern void* rccl_dso_handle; auto operator()(Args... 
args) -> decltype(__name(args...)) { \ using nccl_func = decltype(&::__name); \ std::call_once(rccl_dso_flag, []() { \ - rccl_dso_handle = paddle::platform::dynload::GetRCCLDsoHandle(); \ + rccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \ }); \ static void* p_##__name = dlsym(rccl_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ diff --git a/paddle/fluid/platform/dynload/rocblas.h b/paddle/fluid/platform/dynload/rocblas.h index 7d812c4db1bb0d..f78ed00ac63d03 100644 --- a/paddle/fluid/platform/dynload/rocblas.h +++ b/paddle/fluid/platform/dynload/rocblas.h @@ -36,19 +36,19 @@ extern void *rocblas_dso_handle; * * note: default dynamic linked libs */ -#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using rocblas_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(rocblas_dso_flag, []() { \ - rocblas_dso_handle = paddle::platform::dynload::GetRocblasDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using rocblas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ + std::call_once(rocblas_dso_flag, []() { \ + rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(rocblas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \ diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc index c26e7f57ecaa2d..89ddd3018ffa69 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.cc +++ b/paddle/fluid/platform/dynload/rocm_driver.cc @@ -25,8 +25,8 @@ void* rocm_dso_handle = nullptr; ROCM_ROUTINE_EACH(DEFINE_WRAP); -bool HasROCMDriver() { - std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetROCMDsoHandle(); }); +bool HasCUDADriver() { + std::call_once(rocm_dso_flag, []() { rocm_dso_handle = GetCUDADsoHandle(); }); return rocm_dso_handle != nullptr; } diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 487eed27b40a29..dc9c18e732b0ba 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -26,7 +26,7 @@ namespace dynload { extern std::once_flag rocm_dso_flag; extern void* rocm_dso_handle; -extern bool HasROCMDriver(); +extern bool HasCUDADriver(); #define DECLARE_DYNAMIC_LOAD_ROCM_WRAP(__name) \ struct DynLoad__##__name { \ @@ -34,7 +34,7 @@ extern bool HasROCMDriver(); auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ using rocm_func = decltype(&::__name); \ std::call_once(rocm_dso_flag, []() { \ - rocm_dso_handle = paddle::platform::dynload::GetROCMDsoHandle(); \ + rocm_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \ }); \ static void* p_##__name = dlsym(rocm_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ diff --git a/paddle/fluid/platform/dynload/rocrand.cc b/paddle/fluid/platform/dynload/rocrand.cc deleted file mode 100644 index 8e14f88ff9c182..00000000000000 --- a/paddle/fluid/platform/dynload/rocrand.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/rocrand.h" - -namespace paddle { -namespace platform { -namespace dynload { - -std::once_flag curand_dso_flag; -void *curand_dso_handle; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/rocrand.h b/paddle/fluid/platform/dynload/rocrand.h deleted file mode 100644 index 0ff88855f244ef..00000000000000 --- a/paddle/fluid/platform/dynload/rocrand.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include - -#include // NOLINT -#include "paddle/fluid/platform/port.h" - -#include "paddle/fluid/platform/dynload/dynamic_loader.h" - -namespace paddle { -namespace platform { -namespace dynload { -extern std::once_flag curand_dso_flag; -extern void *curand_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - rocrand_status operator()(Args... args) { \ - using curandFunc = decltype(&::__name); \ - std::call_once(curand_dso_flag, []() { \ - curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(curand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define CURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(rocrand_create_generator); \ - __macro(rocrand_set_stream); \ - __macro(rocrand_generate_uniform); \ - __macro(rocrand_generate_uniform_double); \ - __macro(rocrand_generate_normal); \ - __macro(rocrand_destroy_generator); - -CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); - -} // namespace dynload -} // namespace platform -} // namespace paddle From c58044ff782cb3d6c6646f0650946910fa81def3 Mon Sep 17 00:00:00 2001 From: xuanbaby Date: Mon, 14 Dec 2020 11:41:52 +0800 Subject: [PATCH 10/10] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=97=A5=E6=9C=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/platform/dynload/hiprtc.cc | 2 +- paddle/fluid/platform/dynload/rocm_driver.cc | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/dynload/hiprtc.cc b/paddle/fluid/platform/dynload/hiprtc.cc index 537597aaafd4b4..86a39d08eaa520 100644 --- a/paddle/fluid/platform/dynload/hiprtc.cc +++ b/paddle/fluid/platform/dynload/hiprtc.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/platform/dynload/rocm_driver.cc b/paddle/fluid/platform/dynload/rocm_driver.cc index 89ddd3018ffa69..9ec123b632ffa4 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.cc +++ b/paddle/fluid/platform/dynload/rocm_driver.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.