[ROCM] update fluid platform for rocm39 (part3), test=develop #30913

Merged (1 commit) on Feb 8, 2021
59 changes: 43 additions & 16 deletions paddle/fluid/platform/CMakeLists.txt
@@ -52,7 +52,12 @@ ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
IF(WITH_GPU)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
IF(WITH_ROCM)
hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()

cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
@@ -72,7 +77,7 @@ IF(WITH_DGC)
set(dgc_deps dgc)
ENDIF()

IF(WITH_GPU)
IF(WITH_GPU OR WITH_ROCM)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()

@@ -81,9 +86,14 @@ IF(WITH_MKLDNN)
ELSE()
set(MKLDNN_CTX_DEPS)
ENDIF()

nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
IF(WITH_GPU)
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_ROCM)
hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()

IF(WITH_GPU OR WITH_ROCM)
set(STREAM_CALLBACK_DEPS stream_callback_manager)
ELSE()
set(STREAM_CALLBACK_DEPS)
@@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool

cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)

if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
target_link_libraries(device_context cuda_resource_pool)
endif()

nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)

cc_test(init_test SRCS init_test.cc DEPS device_context)

nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if(WITH_GPU)
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()

if(WITH_ROCM)
hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor)
hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()

cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
@@ -127,25 +145,34 @@ if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()

cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)

cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)

nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
IF(WITH_GPU)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()

nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
IF(WITH_ROCM)
hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()

if(NOT APPLE AND NOT WIN32)
cc_library(device_code SRCS device_code.cc DEPS device_context)
if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
endif()
endif()
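
Note on the CMakeLists.txt changes above: each GPU-related target (gpu_info, stream_callback_manager, the device_context and float16 tests, profiler, cuda_device_guard, and so on) keeps its name but is now declared with nv_library/nv_test under WITH_GPU and with hip_library/hip_test under WITH_ROCM, so downstream targets depend on a single name regardless of backend. The lines below are only an illustrative sketch, not Paddle's actual gpu_info code, of how a source file behind such a target can pick the matching runtime API at compile time; PADDLE_WITH_CUDA and PADDLE_WITH_HIP are the macros this PR already uses in the C++ changes further down.

// Illustrative sketch only; not Paddle's gpu_info implementation.
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
#endif

inline int GetGpuDeviceCount() {
  int count = 0;
#if defined(PADDLE_WITH_HIP)
  hipGetDeviceCount(&count);   // HIP runtime on ROCm builds
#elif defined(PADDLE_WITH_CUDA)
  cudaGetDeviceCount(&count);  // CUDA runtime on CUDA builds
#endif
  return count;                // 0 on CPU-only builds
}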
5 changes: 3 additions & 2 deletions paddle/fluid/platform/collective_helper.cc
@@ -13,10 +13,11 @@
// limitations under the License.

#include "paddle/fluid/platform/collective_helper.h"
#include <utility>

namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class NCCLCommImpl : public NCCLComm {
public:
void set_ring_id(int ring_id) { ring_id_ = ring_id; }
@@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm {
void set_comm(ncclComm_t comm) { comm_ = comm; }
ncclComm_t comm() const override { return comm_; }

cudaStream_t stream() const override { return dev_ctx_->stream(); }
gpuStream_t stream() const override { return dev_ctx_->stream(); }

void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
dev_ctx_ = std::move(dev_ctx);
4 changes: 2 additions & 2 deletions paddle/fluid/platform/collective_helper.h
@@ -27,7 +27,7 @@
namespace paddle {
namespace platform {

#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// In order to apply hierarchical communication with NCCL, we need
// a communication ring contains NCCL communicators associated to a global
// ncclUniqueId. E.g. for a hierarchical case,
@@ -56,7 +56,7 @@ class NCCLComm {
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual ncclComm_t comm() const = 0;
virtual cudaStream_t stream() const = 0;
virtual gpuStream_t stream() const = 0;
virtual CUDADeviceContext* dev_context() const = 0;
virtual ~NCCLComm() = default;
};
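
The collective_helper changes switch the NCCLComm::stream() return type from cudaStream_t to gpuStream_t and widen the guard from PADDLE_WITH_NCCL alone to PADDLE_WITH_NCCL or PADDLE_WITH_RCCL, so the same communicator wrapper serves RCCL on ROCm, where streams are hipStream_t. gpuStream_t itself is defined elsewhere in Paddle; the lines below are only a sketch of how such a backend-neutral alias is commonly declared under the same macro.

// Sketch of a backend-neutral stream alias; the real definition lives
// elsewhere in Paddle. hipStream_t and cudaStream_t come from the
// respective runtime headers.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif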
39 changes: 27 additions & 12 deletions paddle/fluid/platform/cuda_device_function.h
@@ -14,24 +14,26 @@ limitations under the License. */

#pragma once

#include <cuda.h>
// NOTE(): support float16 to half in header file.
#define PADDLE_CUDA_FP16
#include <cuda_fp16.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace platform {

#ifdef PADDLE_WITH_HIP
#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
#else
#if CUDA_VERSION < 9000
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
#endif

inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
@@ -67,7 +69,7 @@ template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
@@ -77,26 +79,35 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
template <typename T>
__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_xor(val, width);
#else
return __shfl_xor_sync(mask, val, width);
#endif
}

// CUDA 9.0 have native compatible float16 shfl_down
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_down(static_cast<float>(val),
static_cast<unsigned>(delta), width));
#else
return float16(
__shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
#endif
}
template <>
__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
float16 val, int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_xor(static_cast<float>(val), width));
#else
return float16(__shfl_xor(static_cast<half>(val), width));
#endif
}
#else
template <>
@@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
template <typename T>
__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
int width = 32) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl(val, src_line, width);
#else
return __shfl_sync(mask, val, src_line, width);
@@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() {

template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
#ifdef PADDLE_WITH_HIP
const int warpSize = 64;
#else
const int warpSize = 32;
#endif
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
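
The cuda_device_function.h changes handle two HIP differences: ROCm has no masked *_sync shuffle intrinsics, so CREATE_SHFL_MASK and the CudaShuffle* helpers fall back to __ballot, __shfl_down, and __shfl_xor, and AMD GPUs use a 64-lane wavefront rather than the 32-lane NVIDIA warp, hence the warpSize switch in reduceSum. As an aside, a minimal warp-level sum built on the same fallback pattern might look like the sketch below; this is illustrative code, not part of the PR, and it assumes float values and a full-warp mask.

// Illustrative warp-level sum using the same CUDA/HIP fallback pattern as
// CudaShuffleDownSync above; simplified to float and a full-warp mask.
#ifdef PADDLE_WITH_HIP
#define GPU_WARP_SIZE 64  // AMD wavefront width
#define SHFL_DOWN(mask, val, delta) __shfl_down((val), (delta))
#else
#define GPU_WARP_SIZE 32  // NVIDIA warp width
#define SHFL_DOWN(mask, val, delta) __shfl_down_sync((mask), (val), (delta))
#endif

__forceinline__ __device__ float WarpReduceSum(float val) {
  unsigned mask = 0xFFFFFFFFu;  // full mask; ignored by HIP's __shfl_down
  for (int offset = GPU_WARP_SIZE / 2; offset > 0; offset /= 2) {
    val += SHFL_DOWN(mask, val, offset);
  }
  return val;
}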
22 changes: 21 additions & 1 deletion paddle/fluid/platform/cuda_helper.h
@@ -16,11 +16,16 @@

#include <mutex> // NOLINT

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rocblas.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"

#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
#endif

@@ -77,6 +82,12 @@ namespace platform {

class CublasHandleHolder {
public:
#ifdef PADDLE_WITH_HIP
explicit CublasHandleHolder(hipStream_t stream) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
}
#else
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
@@ -92,9 +103,14 @@ class CublasHandleHolder {
}
#endif // CUDA_VERSION >= 9000
}
#endif

~CublasHandleHolder() PADDLE_MAY_THROW {
#ifdef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
#else
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
#endif
}

template <typename Callback>
@@ -106,7 +122,11 @@ class CublasHandleHolder {
private:
DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);

#ifdef PADDLE_WITH_HIP
rocblas_handle handle_;
#else
cublasHandle_t handle_;
#endif
mutable std::mutex mtx_;
};

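
In cuda_helper.h, CublasHandleHolder now stores a rocblas_handle on ROCm and a cublasHandle_t on CUDA, creating, binding to the stream, and destroying it through the matching dynload wrappers. The sketch below shows the same RAII idea in stripped-down form; it is a simplified illustration under assumptions (direct library calls, no error handling or retry, guessed include paths), not Paddle's class.

// Stripped-down sketch of the RAII pattern used by CublasHandleHolder.
// Paddle goes through its dynload:: wrappers and PADDLE_RETRY_CUDA_SUCCESS;
// this illustration calls the libraries directly and skips error handling.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#include <rocblas.h>  // may be <rocblas/rocblas.h> on newer ROCm layouts
class BlasHandle {
 public:
  explicit BlasHandle(hipStream_t stream) {
    rocblas_create_handle(&handle_);
    rocblas_set_stream(handle_, stream);
  }
  ~BlasHandle() { rocblas_destroy_handle(handle_); }
 private:
  rocblas_handle handle_;
};
#else
#include <cublas_v2.h>
class BlasHandle {
 public:
  explicit BlasHandle(cudaStream_t stream) {
    cublasCreate(&handle_);
    cublasSetStream(handle_, stream);
  }
  ~BlasHandle() { cublasDestroy(handle_); }
 private:
  cublasHandle_t handle_;
};
#endif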