[ROCM] update fluid platform for rocm39 (part3), test=develop (#30913)
qili93 authored Feb 8, 2021
1 parent 15297a0 commit 93c1d9e
Showing 19 changed files with 750 additions and 110 deletions.
59 changes: 43 additions & 16 deletions paddle/fluid/platform/CMakeLists.txt
@@ -52,7 +52,12 @@ ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
IF(WITH_GPU)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
IF(WITH_ROCM)
hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()

cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
@@ -72,7 +77,7 @@ IF(WITH_DGC)
set(dgc_deps dgc)
ENDIF()

IF(WITH_GPU)
IF(WITH_GPU OR WITH_ROCM)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()

@@ -81,9 +86,14 @@ IF(WITH_MKLDNN)
ELSE()
set(MKLDNN_CTX_DEPS)
ENDIF()

nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
IF(WITH_GPU)
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_ROCM)
hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()

IF(WITH_GPU OR WITH_ROCM)
set(STREAM_CALLBACK_DEPS stream_callback_manager)
ELSE()
set(STREAM_CALLBACK_DEPS)
@@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool

cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)

if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
target_link_libraries(device_context cuda_resource_pool)
endif()

nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)

cc_test(init_test SRCS init_test.cc DEPS device_context)

nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if(WITH_GPU)
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()

if(WITH_ROCM)
hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor)
hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()

cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
@@ -127,25 +145,34 @@ if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()

cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)

cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)

nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
IF(WITH_GPU)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()

nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
IF(WITH_ROCM)
hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()

if(NOT APPLE AND NOT WIN32)
cc_library(device_code SRCS device_code.cc DEPS device_context)
if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
endif()
endif()
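
The pattern above recurs through the rest of the CMake diff: every CUDA-only rule (nv_library, nv_test) gains a hip_library / hip_test twin, and shared conditions are widened to WITH_GPU OR WITH_ROCM. In the C++ sources this build split is assumed to surface as the PADDLE_WITH_CUDA / PADDLE_WITH_HIP guards seen in the later hunks of this commit; the sketch below illustrates that dispatch and is not code from the commit.

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
#endif

// Query the device count through whichever runtime the build selected.
inline int VisibleDeviceCount() {
  int count = 0;
#if defined(PADDLE_WITH_HIP)
  hipGetDeviceCount(&count);
#elif defined(PADDLE_WITH_CUDA)
  cudaGetDeviceCount(&count);
#endif
  return count;
}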
5 changes: 3 additions & 2 deletions paddle/fluid/platform/collective_helper.cc
@@ -13,10 +13,11 @@
// limitations under the License.

#include "paddle/fluid/platform/collective_helper.h"
#include <utility>

namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class NCCLCommImpl : public NCCLComm {
public:
void set_ring_id(int ring_id) { ring_id_ = ring_id; }
@@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm {
void set_comm(ncclComm_t comm) { comm_ = comm; }
ncclComm_t comm() const override { return comm_; }

cudaStream_t stream() const override { return dev_ctx_->stream(); }
gpuStream_t stream() const override { return dev_ctx_->stream(); }

void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
dev_ctx_ = std::move(dev_ctx);
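The widened PADDLE_WITH_NCCL || PADDLE_WITH_RCCL guard works because RCCL keeps NCCL's API surface: ncclComm_t, ncclUniqueId, and the call names are declared identically, so NCCLCommImpl compiles unchanged on ROCm. A minimal sketch of that symmetry follows; the header paths are assumptions, not taken from this commit.

#if defined(PADDLE_WITH_RCCL)
#include <rccl.h>  // ROCm collectives, NCCL-compatible API
#elif defined(PADDLE_WITH_NCCL)
#include <nccl.h>
#endif

// The bootstrap call is spelled the same on both backends.
inline ncclUniqueId MakeCommUniqueId() {
  ncclUniqueId id;
  ncclGetUniqueId(&id);
  return id;
}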
4 changes: 2 additions & 2 deletions paddle/fluid/platform/collective_helper.h
@@ -27,7 +27,7 @@
namespace paddle {
namespace platform {

#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// In order to apply hierarchical communication with NCCL, we need
// a communication ring contains NCCL communicators associated to a global
// ncclUniqueId. E.g. for a hierarchical case,
@@ -56,7 +56,7 @@ class NCCLComm {
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual ncclComm_t comm() const = 0;
virtual cudaStream_t stream() const = 0;
virtual gpuStream_t stream() const = 0;
virtual CUDADeviceContext* dev_context() const = 0;
virtual ~NCCLComm() = default;
};
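gpuStream_t is a backend-neutral alias, so interfaces such as NCCLComm::stream() need no per-backend overloads. Its actual definition lives elsewhere in the Paddle codebase; a plausible minimal form, stated here only as an assumption:

#ifdef PADDLE_WITH_HIP
using gpuStream_t = hipStream_t;   // ROCm stream handle
#else
using gpuStream_t = cudaStream_t;  // CUDA stream handle
#endif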
39 changes: 27 additions & 12 deletions paddle/fluid/platform/cuda_device_function.h
@@ -14,24 +14,26 @@ limitations under the License. */

#pragma once

#include <cuda.h>
// NOTE(): support float16 to half in header file.
#define PADDLE_CUDA_FP16
#include <cuda_fp16.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace platform {

#ifdef PADDLE_WITH_HIP
#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
#else
#if CUDA_VERSION < 9000
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
#endif

inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
@@ -67,7 +69,7 @@ template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
@@ -77,26 +79,35 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
template <typename T>
__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_xor(val, width);
#else
return __shfl_xor_sync(mask, val, width);
#endif
}

// CUDA 9.0 have native compatible float16 shfl_down
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_down(static_cast<float>(val),
static_cast<unsigned>(delta), width));
#else
return float16(
__shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
#endif
}
template <>
__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
float16 val, int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_xor(static_cast<float>(val), width));
#else
return float16(__shfl_xor(static_cast<half>(val), width));
#endif
}
#else
template <>
Expand Down Expand Up @@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
template <typename T>
__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
int width = 32) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl(val, src_line, width);
#else
return __shfl_sync(mask, val, src_line, width);
@@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() {

template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
#ifdef PADDLE_WITH_HIP
const int warpSize = 64;
#else
const int warpSize = 32;
#endif
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
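Two platform differences drive this file's changes: HIP exposes only the pre-CUDA-9 shuffle/ballot intrinsics (no *_sync variants, no lane mask), and the diff hard-codes a 64-lane wavefront under HIP versus the 32-lane NVIDIA warp. The standalone warp-sum sketch below shows the same dispatch under an assumed PADDLE_WITH_HIP guard; it is an illustration, not Paddle's reduceSum.

template <typename T>
__forceinline__ __device__ T WarpReduceSum(T val) {
#ifdef PADDLE_WITH_HIP
  const int kWidth = 64;  // AMD wavefront size
  for (int offset = kWidth / 2; offset > 0; offset /= 2) {
    val += __shfl_down(val, offset, kWidth);  // maskless HIP intrinsic
  }
#else
  const int kWidth = 32;  // NVIDIA warp size
  unsigned mask = __ballot_sync(0xFFFFFFFFu, true);
  for (int offset = kWidth / 2; offset > 0; offset /= 2) {
    val += __shfl_down_sync(mask, val, offset, kWidth);
  }
#endif
  return val;
}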
22 changes: 21 additions & 1 deletion paddle/fluid/platform/cuda_helper.h
@@ -16,11 +16,16 @@

#include <mutex> // NOLINT

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rocblas.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"

#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
#endif

@@ -77,6 +82,12 @@ namespace platform {

class CublasHandleHolder {
public:
#ifdef PADDLE_WITH_HIP
explicit CublasHandleHolder(hipStream_t stream) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
}
#else
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
@@ -92,9 +103,14 @@
}
#endif // CUDA_VERSION >= 9000
}
#endif

~CublasHandleHolder() PADDLE_MAY_THROW {
#ifdef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
#else
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
#endif
}

template <typename Callback>
@@ -106,7 +122,11 @@ class CublasHandleHolder {
private:
DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);

#ifdef PADDLE_WITH_HIP
rocblas_handle handle_;
#else
cublasHandle_t handle_;
#endif
mutable std::mutex mtx_;
};

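CublasHandleHolder keeps a single RAII type over two BLAS backends: the ROCm constructor takes only a stream and drops the cublasMath_t argument (rocBLAS has no direct counterpart of cublasSetMathMode), while destruction maps to rocblas_destroy_handle. A stripped-down sketch of the same idea, calling the vendor APIs directly instead of Paddle's dynload wrappers and with error handling omitted; header paths are assumptions.

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#include <rocblas.h>
#else
#include <cublas_v2.h>
#endif

class BlasHandle {
 public:
#ifdef PADDLE_WITH_HIP
  explicit BlasHandle(hipStream_t stream) {
    rocblas_create_handle(&handle_);
    rocblas_set_stream(handle_, stream);
  }
  ~BlasHandle() { rocblas_destroy_handle(handle_); }

 private:
  rocblas_handle handle_;
#else
  explicit BlasHandle(cudaStream_t stream) {
    cublasCreate(&handle_);
    cublasSetStream(handle_, stream);
  }
  ~BlasHandle() { cublasDestroy(handle_); }

 private:
  cublasHandle_t handle_;
#endif
};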