[ROCM] update fluid platform for rocm39 (part3), test=develop #30913

Merged (1 commit) on Feb 8, 2021
59 changes: 43 additions & 16 deletions paddle/fluid/platform/CMakeLists.txt
@@ -52,7 +52,12 @@ ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
IF(WITH_GPU)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
IF(WITH_ROCM)
hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()

cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
@@ -72,7 +77,7 @@ IF(WITH_DGC)
set(dgc_deps dgc)
ENDIF()

IF(WITH_GPU)
IF(WITH_GPU OR WITH_ROCM)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()

@@ -81,9 +86,14 @@ IF(WITH_MKLDNN)
ELSE()
set(MKLDNN_CTX_DEPS)
ENDIF()

nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
IF(WITH_GPU)
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_ROCM)
hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()

IF(WITH_GPU OR WITH_ROCM)
set(STREAM_CALLBACK_DEPS stream_callback_manager)
ELSE()
set(STREAM_CALLBACK_DEPS)
@@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool

cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)

if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
target_link_libraries(device_context cuda_resource_pool)
endif()

nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)

cc_test(init_test SRCS init_test.cc DEPS device_context)

nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if(WITH_GPU)
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()

if(WITH_ROCM)
hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor)
hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()

cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
@@ -127,25 +145,34 @@ if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()

cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)

cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)

nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
IF(WITH_GPU)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()

nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
IF(WITH_ROCM)
hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()

if(NOT APPLE AND NOT WIN32)
cc_library(device_code SRCS device_code.cc DEPS device_context)
if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
endif()
endif()
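
Note on the CMakeLists.txt changes above: each GPU-related target (gpu_info, stream_callback_manager, the device_context and float16 tests, profiler, cuda_device_guard, and so on) keeps its name but is now declared with nv_library/nv_test under WITH_GPU and with hip_library/hip_test under WITH_ROCM, so downstream targets depend on a single name regardless of backend. The lines below are only an illustrative sketch, not Paddle's actual gpu_info code, of how a source file behind such a target can pick the matching runtime API at compile time; PADDLE_WITH_CUDA and PADDLE_WITH_HIP are the macros this PR already uses in the C++ changes further down.

// Illustrative sketch only; not Paddle's gpu_info implementation.
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
#endif

inline int GetGpuDeviceCount() {
  int count = 0;
#if defined(PADDLE_WITH_HIP)
  hipGetDeviceCount(&count);   // HIP runtime on ROCm builds
#elif defined(PADDLE_WITH_CUDA)
  cudaGetDeviceCount(&count);  // CUDA runtime on CUDA builds
#endif
  return count;                // 0 on CPU-only builds
}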
5 changes: 3 additions & 2 deletions paddle/fluid/platform/collective_helper.cc
@@ -13,10 +13,11 @@
// limitations under the License.

#include "paddle/fluid/platform/collective_helper.h"
#include <utility>

namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class NCCLCommImpl : public NCCLComm {
public:
void set_ring_id(int ring_id) { ring_id_ = ring_id; }
@@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm {
void set_comm(ncclComm_t comm) { comm_ = comm; }
ncclComm_t comm() const override { return comm_; }

cudaStream_t stream() const override { return dev_ctx_->stream(); }
gpuStream_t stream() const override { return dev_ctx_->stream(); }

void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
dev_ctx_ = std::move(dev_ctx);
4 changes: 2 additions & 2 deletions paddle/fluid/platform/collective_helper.h
@@ -27,7 +27,7 @@
namespace paddle {
namespace platform {

#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// In order to apply hierarchical communication with NCCL, we need
// a communication ring contains NCCL communicators associated to a global
// ncclUniqueId. E.g. for a hierarchical case,
@@ -56,7 +56,7 @@ class NCCLComm {
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual ncclComm_t comm() const = 0;
virtual cudaStream_t stream() const = 0;
virtual gpuStream_t stream() const = 0;
virtual CUDADeviceContext* dev_context() const = 0;
virtual ~NCCLComm() = default;
};
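
The collective_helper changes switch the NCCLComm::stream() return type from cudaStream_t to gpuStream_t and widen the guard from PADDLE_WITH_NCCL alone to PADDLE_WITH_NCCL or PADDLE_WITH_RCCL, so the same communicator wrapper serves RCCL on ROCm, where streams are hipStream_t. gpuStream_t itself is defined elsewhere in Paddle; the lines below are only a sketch of how such a backend-neutral alias is commonly declared under the same macro.

// Sketch of a backend-neutral stream alias; the real definition lives
// elsewhere in Paddle. hipStream_t and cudaStream_t come from the
// respective runtime headers.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif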
39 changes: 27 additions & 12 deletions paddle/fluid/platform/cuda_device_function.h
@@ -14,24 +14,26 @@ limitations under the License. */

#pragma once

#include <cuda.h>
// NOTE(): support float16 to half in header file.
#define PADDLE_CUDA_FP16
#include <cuda_fp16.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace platform {

#ifdef PADDLE_WITH_HIP
#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
#else
#if CUDA_VERSION < 9000
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
#endif

inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
@@ -67,7 +69,7 @@ template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
@@ -77,26 +79,35 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
template <typename T>
__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_xor(val, width);
#else
return __shfl_xor_sync(mask, val, width);
#endif
}

// CUDA 9.0 have native compatible float16 shfl_down
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_down(static_cast<float>(val),
static_cast<unsigned>(delta), width));
#else
return float16(
__shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
#endif
}
template <>
__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
float16 val, int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_xor(static_cast<float>(val), width));
#else
return float16(__shfl_xor(static_cast<half>(val), width));
#endif
}
#else
template <>
@@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
template <typename T>
__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
int width = 32) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl(val, src_line, width);
#else
return __shfl_sync(mask, val, src_line, width);
@@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() {

template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
#ifdef PADDLE_WITH_HIP
const int warpSize = 64;
#else
const int warpSize = 32;
#endif
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
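
The cuda_device_function.h changes handle two HIP differences: ROCm has no masked *_sync shuffle intrinsics, so CREATE_SHFL_MASK and the CudaShuffle* helpers fall back to __ballot, __shfl_down, and __shfl_xor, and AMD GPUs use a 64-lane wavefront rather than the 32-lane NVIDIA warp, hence the warpSize switch in reduceSum. As an aside, a minimal warp-level sum built on the same fallback pattern might look like the sketch below; this is illustrative code, not part of the PR, and it assumes float values and a full-warp mask.

// Illustrative warp-level sum using the same CUDA/HIP fallback pattern as
// CudaShuffleDownSync above; simplified to float and a full-warp mask.
#ifdef PADDLE_WITH_HIP
#define GPU_WARP_SIZE 64  // AMD wavefront width
#define SHFL_DOWN(mask, val, delta) __shfl_down((val), (delta))
#else
#define GPU_WARP_SIZE 32  // NVIDIA warp width
#define SHFL_DOWN(mask, val, delta) __shfl_down_sync((mask), (val), (delta))
#endif

__forceinline__ __device__ float WarpReduceSum(float val) {
  unsigned mask = 0xFFFFFFFFu;  // full mask; ignored by HIP's __shfl_down
  for (int offset = GPU_WARP_SIZE / 2; offset > 0; offset /= 2) {
    val += SHFL_DOWN(mask, val, offset);
  }
  return val;
}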
22 changes: 21 additions & 1 deletion paddle/fluid/platform/cuda_helper.h
@@ -16,11 +16,16 @@

#include <mutex> // NOLINT

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rocblas.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"

#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
#endif

@@ -77,6 +82,12 @@ namespace platform {

class CublasHandleHolder {
public:
#ifdef PADDLE_WITH_HIP
explicit CublasHandleHolder(hipStream_t stream) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
}
#else
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
@@ -92,9 +103,14 @@ class CublasHandleHolder {
}
#endif // CUDA_VERSION >= 9000
}
#endif

~CublasHandleHolder() PADDLE_MAY_THROW {
#ifdef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
#else
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
#endif
}

template <typename Callback>
@@ -106,7 +122,11 @@ class CublasHandleHolder {
private:
DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);

#ifdef PADDLE_WITH_HIP
rocblas_handle handle_;
#else
cublasHandle_t handle_;
#endif
mutable std::mutex mtx_;
};

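
In cuda_helper.h, CublasHandleHolder now stores a rocblas_handle on ROCm and a cublasHandle_t on CUDA, creating, binding to the stream, and destroying it through the matching dynload wrappers. The sketch below shows the same RAII idea in stripped-down form; it is a simplified illustration under assumptions (direct library calls, no error handling or retry, guessed include paths), not Paddle's class.

// Stripped-down sketch of the RAII pattern used by CublasHandleHolder.
// Paddle goes through its dynload:: wrappers and PADDLE_RETRY_CUDA_SUCCESS;
// this illustration calls the libraries directly and skips error handling.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#include <rocblas.h>  // may be <rocblas/rocblas.h> on newer ROCm layouts
class BlasHandle {
 public:
  explicit BlasHandle(hipStream_t stream) {
    rocblas_create_handle(&handle_);
    rocblas_set_stream(handle_, stream);
  }
  ~BlasHandle() { rocblas_destroy_handle(handle_); }
 private:
  rocblas_handle handle_;
};
#else
#include <cublas_v2.h>
class BlasHandle {
 public:
  explicit BlasHandle(cudaStream_t stream) {
    cublasCreate(&handle_);
    cublasSetStream(handle_, stream);
  }
  ~BlasHandle() { cublasDestroy(handle_); }
 private:
  cublasHandle_t handle_;
};
#endif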