diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 410889cfb9e7e..47344f0e3733d 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -52,7 +52,12 @@ ENDIF()
 cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
-nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
+IF(WITH_GPU)
+    nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
+ENDIF()
+IF(WITH_ROCM)
+    hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
+ENDIF()
 
 cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
@@ -72,7 +77,7 @@ IF(WITH_DGC)
     set(dgc_deps dgc)
 ENDIF()
 
-IF(WITH_GPU)
+IF(WITH_GPU OR WITH_ROCM)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
 ENDIF()
 
@@ -81,9 +86,14 @@ IF(WITH_MKLDNN)
 ELSE()
     set(MKLDNN_CTX_DEPS)
 ENDIF()
-
-nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
 IF(WITH_GPU)
+    nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+ENDIF()
+IF(WITH_ROCM)
+    hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+ENDIF()
+
+IF(WITH_GPU OR WITH_ROCM)
     set(STREAM_CALLBACK_DEPS stream_callback_manager)
 ELSE()
     set(STREAM_CALLBACK_DEPS)
@@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
 
 cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
 
-if(WITH_GPU)
+if(WITH_GPU OR WITH_ROCM)
   cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
   target_link_libraries(device_context cuda_resource_pool)
 endif()
 
-nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
-
 cc_test(init_test SRCS init_test.cc DEPS device_context)
 
-nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+if(WITH_GPU)
+  nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
+  nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+  nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
+  nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+endif()
+
+if(WITH_ROCM)
+  hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
+  hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
+  hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor)
+  hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+endif()
 
 cc_library(timer SRCS timer.cc)
 cc_test(timer_test SRCS timer_test.cc DEPS timer)
@@ -127,25 +145,34 @@ if(WITH_GPU)
   nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
   nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
   nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
+elseif(WITH_ROCM)
+  hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
+  hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
+  hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 else()
   cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
   cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
 endif()
 
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-
-nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
-
 cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
 
-nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
+IF(WITH_GPU)
+    nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
+    nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
+    nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+ENDIF()
 
-nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+IF(WITH_ROCM)
+    hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
+    hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
+    hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
+ENDIF()
 
 if(NOT APPLE AND NOT WIN32)
   cc_library(device_code SRCS device_code.cc DEPS device_context)
-  if(WITH_GPU)
+  if(WITH_GPU OR WITH_ROCM)
     cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
   endif()
 endif()
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
index 0ef3a18448544..4b16a67b235fd 100644
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
@@ -13,10 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/collective_helper.h"
+#include <utility>
 
 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class NCCLCommImpl : public NCCLComm {
  public:
   void set_ring_id(int ring_id) { ring_id_ = ring_id; }
@@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm {
   void set_comm(ncclComm_t comm) { comm_ = comm; }
   ncclComm_t comm() const override { return comm_; }
 
-  cudaStream_t stream() const override { return dev_ctx_->stream(); }
+  gpuStream_t stream() const override { return dev_ctx_->stream(); }
 
   void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
     dev_ctx_ = std::move(dev_ctx);
diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h
index 0cd501da428bc..8a6719ab685b8 100644
--- a/paddle/fluid/platform/collective_helper.h
+++ b/paddle/fluid/platform/collective_helper.h
@@ -27,7 +27,7 @@
 namespace paddle {
 namespace platform {
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 // In order to apply hierarchical communication with NCCL, we need
 // a communication ring contains NCCL communicators associated to a global
 // ncclUniqueId. E.g. for a hierarchical case,
@@ -56,7 +56,7 @@ class NCCLComm {
   virtual int rank() const = 0;
   virtual int device_id() const = 0;
   virtual ncclComm_t comm() const = 0;
-  virtual cudaStream_t stream() const = 0;
+  virtual gpuStream_t stream() const = 0;
   virtual CUDADeviceContext* dev_context() const = 0;
   virtual ~NCCLComm() = default;
 };
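The cudaStream_t to gpuStream_t switch above leans on a platform-neutral alias so NCCL and RCCL code can share one signature. A minimal sketch of such an alias, assuming the same PADDLE_WITH_HIP guard used throughout this patch (the actual definition lives in Paddle's platform headers, so this is illustrative only):

```cpp
// Illustrative alias pair: one stream/event handle type for both runtimes.
// Assumes <cuda_runtime.h> or <hip/hip_runtime.h> is already in scope.
#ifdef PADDLE_WITH_HIP
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#else
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif
```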
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
index a70050bae113d..4f504b414de4a 100644
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -14,10 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include <cuda.h>
 // NOTE(): support float16 to half in header file.
 #define PADDLE_CUDA_FP16
-#include <cuda_fp16.h>
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/float16.h"
@@ -25,6 +23,9 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+#ifdef PADDLE_WITH_HIP
+#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
+#else
 #if CUDA_VERSION < 9000
 #define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
 #else
@@ -32,6 +33,7 @@ namespace platform {
 #define CREATE_SHFL_MASK(mask, predicate) \
   mask = __ballot_sync(FULL_WARP_MASK, (predicate))
 #endif
+#endif
 
 inline static int RoundToPowerOfTwo(int dim) {
   if (dim > 512) {
@@ -67,7 +69,7 @@ template <typename T>
 __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
                                                  int delta,
                                                  int width = warpSize) {
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
   return __shfl_down(val, delta, width);
 #else
   return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
@@ -77,7 +79,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
 template <typename T>
 __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
                                                 int width = warpSize) {
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
   return __shfl_xor(val, width);
 #else
   return __shfl_xor_sync(mask, val, width);
@@ -85,18 +87,27 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
 }
 
 // CUDA 9.0 have native compatible float16 shfl_down
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
 template <>
 __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
                                                        float16 val, int delta,
                                                        int width) {
+#ifdef PADDLE_WITH_HIP
+  return float16(__shfl_down(static_cast<float>(val),
+                             static_cast<unsigned>(delta), width));
+#else
   return float16(
       __shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
+#endif
 }
 template <>
 __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
                                                       float16 val, int width) {
+#ifdef PADDLE_WITH_HIP
+  return float16(__shfl_xor(static_cast<float>(val), width));
+#else
   return float16(__shfl_xor(static_cast<half>(val), width));
+#endif
 }
 #else
 template <>
@@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
 
 template <typename T>
 __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
                                              int width = 32) {
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
   return __shfl(val, src_line, width);
 #else
   return __shfl_sync(mask, val, src_line, width);
@@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() {
 
 template <typename T>
 __device__ T reduceSum(T val, int tid, int len) {
-  // NOTE(zcd): The warp size should be taken from the
-  // parameters of the GPU but not specified as 32 simply.
-  // To make the reduceSum more efficiently,
-  // I use Warp-Level Parallelism and assume the Warp size
-  // is 32 which may be different for different GPU,
-  // but most card's warp size is 32.
+// NOTE(zcd): The warp size should be taken from the
+// parameters of the GPU but not specified as 32 simply.
+// To make the reduceSum more efficiently,
+// I use Warp-Level Parallelism and assume the Warp size
+// is 32 which may be different for different GPU,
+// but most card's warp size is 32.
+#ifdef PADDLE_WITH_HIP
+  const int warpSize = 64;
+#else
   const int warpSize = 32;
+#endif
   __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
index 9357d5db17cd1..ef0e3a72d1a67 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -16,11 +16,16 @@
 
 #include <mutex>  // NOLINT
 
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/rocblas.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
 enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
 #endif
 
@@ -77,6 +82,12 @@ namespace platform {
 
 class CublasHandleHolder {
  public:
+#ifdef PADDLE_WITH_HIP
+  explicit CublasHandleHolder(hipStream_t stream) {
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
+  }
+#else
   CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
@@ -92,9 +103,14 @@ class CublasHandleHolder {
     }
 #endif  // CUDA_VERSION >= 9000
   }
+#endif
 
   ~CublasHandleHolder() PADDLE_MAY_THROW {
+#ifdef PADDLE_WITH_HIP
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
+#else
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
+#endif
   }
 
   template <typename Callback>
@@ -106,7 +122,11 @@ class CublasHandleHolder {
 
  private:
   DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);
 
+#ifdef PADDLE_WITH_HIP
+  rocblas_handle handle_;
+#else
   cublasHandle_t handle_;
+#endif
 
   mutable std::mutex mtx_;
 };
diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
index 044f4d6748e3a..fd46aa2393403 100644
--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -47,8 +47,13 @@ void TestCase(size_t num) {
   T *in1, *in2, *out;
   T *d_in1, *d_in2;
   size_t size = sizeof(T) * num;
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void**>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void**>(&d_in2), size);
+#else
   cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+#endif
   in1 = reinterpret_cast<T*>(malloc(size));
   in2 = reinterpret_cast<T*>(malloc(size));
   out = reinterpret_cast<T*>(malloc(size));
@@ -58,12 +63,22 @@ void TestCase(size_t num) {
     in1[i] = static_cast<T>(dist(engine));
     in2[i] = static_cast<T>(dist(engine));
   }
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<T>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
+  hipDeviceSynchronize();
+  hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
   AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
   cudaDeviceSynchronize();
   cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < num; ++i) {
     // NOTE(dzhwinter): the float16 add has small underflow/overflow
     // so we use EXPECT_NEAR to check the result.
@@ -73,8 +88,13 @@ void TestCase(size_t num) {
   free(in1);
   free(in2);
   free(out);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }
 
 // cuda primitives
@@ -103,8 +123,13 @@ void TestUnalign(size_t num, const int shift_bit) {
   size_t size = sizeof(uint8_t) * (num + shift_bit);
   size_t array_size = sizeof(float16) * (num / 2);
 
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void**>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void**>(&d_in2), size);
+#else
   cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
+#endif
   in1 = reinterpret_cast<float16*>(malloc(size));
   in2 = reinterpret_cast<float16*>(malloc(size));
   out = reinterpret_cast<float16*>(malloc(size));
@@ -121,12 +146,23 @@ void TestUnalign(size_t num, const int shift_bit) {
     r_in1[i] = static_cast<float16>(dist(engine));
     r_in2[i] = static_cast<float16>(dist(engine));
   }
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, r_in1, array_size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, r_in2, array_size, hipMemcpyHostToDevice);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<float16>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2,
+                     num / 2);
+  hipDeviceSynchronize();
+  hipMemcpy(out, d_in2, array_size, hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
   AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
   cudaDeviceSynchronize();
   cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < num / 2; ++i) {
     // NOTE(dzhwinter): the float16 add has small truncate error.
     // so we use EXPECT_NEAR to check the result.
@@ -137,8 +173,13 @@ void TestUnalign(size_t num, const int shift_bit) {
   free(in1);
   free(in2);
   free(out);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }
 
 TEST(CudaAtomic, float16Unalign) {
@@ -203,8 +244,13 @@ void TestReduce(size_t num, float atol = 0.01) {
   T* in1;
   T *d_in1, *d_in2;
   size_t size = sizeof(T) * num;
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void**>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
+#else
   cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
+#endif
   in1 = reinterpret_cast<T*>(malloc(size));
   std::minstd_rand engine;
   std::uniform_real_distribution<double> dist(0.0, 1.0);
@@ -212,17 +258,31 @@ void TestReduce(size_t num, float atol = 0.01) {
     in1[i] = static_cast<T>(dist(engine));
   }
   auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipDeviceSynchronize();
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(DeviceReduceSum<T>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
+  hipMemcpy(in1, d_in2, sizeof(T), hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
   cudaDeviceSynchronize();
   DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
   cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   // NOTE(dzhwinter): the float16 add has small underflow/overflow
   // so we use EXPECT_NEAR to check the result.
   EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
   free(in1);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }
 
 TEST(CudaShuffleSync, float16) {
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index 72430a3f75323..340372007a77b 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 #include <stdio.h>
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
@@ -50,7 +55,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
       static_cast<unsigned long long int>(val));  // NOLINT
 }
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {
@@ -149,12 +154,12 @@ USE_CUDA_ATOMIC(Max, int);
 USE_CUDA_ATOMIC(Max, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
 USE_CUDA_ATOMIC(Max, unsigned long long int);  // NOLINT
 #else
 CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) {  // NOLINT
   if (*address >= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int old = *address, assumed;  // NOLINT
@@ -181,7 +186,7 @@ CUDA_ATOMIC_WRAPPER(Max, int64_t) {
 
 CUDA_ATOMIC_WRAPPER(Max, float) {
   if (*address >= val) {
-    return;
+    return *address;
   }
 
   int *const address_as_i = reinterpret_cast<int *>(address);
@@ -199,7 +204,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) {
 
 CUDA_ATOMIC_WRAPPER(Max, double) {
   if (*address >= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int *const address_as_ull =  // NOLINT
@@ -221,12 +226,12 @@ USE_CUDA_ATOMIC(Min, int);
 USE_CUDA_ATOMIC(Min, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
 USE_CUDA_ATOMIC(Min, unsigned long long int);  // NOLINT
 #else
 CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {  // NOLINT
   if (*address <= val) {
-    return;
+    return *address;
  }
 
   unsigned long long int old = *address, assumed;  // NOLINT
@@ -253,7 +258,7 @@ CUDA_ATOMIC_WRAPPER(Min, int64_t) {
 
 CUDA_ATOMIC_WRAPPER(Min, float) {
   if (*address <= val) {
-    return;
+    return *address;
   }
 
   int *const address_as_i = reinterpret_cast<int *>(address);
@@ -271,7 +276,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) {
 
 CUDA_ATOMIC_WRAPPER(Min, double) {
   if (*address <= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int *const address_as_ull =  // NOLINT
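The `return;` to `return *address;` fixes above matter because CUDA_ATOMIC_WRAPPER declares a value-returning atomic: like the built-in atomics, callers expect the previous value back even on the early-exit path, and returning nothing from a non-void function is undefined behavior. A compact sketch of the compare-and-swap loop these wrappers implement, written as a standalone function rather than Paddle's exact macro expansion:

```cpp
// CAS-based atomicMax for float: atomicCAS only operates on integer words,
// so the float is reinterpreted bit-wise on every round-trip.
__device__ float AtomicMaxFloat(float* address, float val) {
  if (*address >= val) {
    return *address;  // the early exit must still report the old value
  }
  int* address_as_i = reinterpret_cast<int*>(address);
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    if (__int_as_float(assumed) >= val) break;  // another thread won the race
    old = atomicCAS(address_as_i, assumed, __float_as_int(val));
  } while (assumed != old);
  return __int_as_float(old);
}
```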
diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/cuda_resource_pool.cc
index 6ecb312d72072..70d2ec5505798 100644
--- a/paddle/fluid/platform/cuda_resource_pool.cc
+++ b/paddle/fluid/platform/cuda_resource_pool.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_resource_pool.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
@@ -25,15 +25,24 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
   for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
     auto creator = [dev_idx] {
       platform::SetDeviceId(dev_idx);
-      cudaStream_t stream;
+      gpuStream_t stream;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+#endif
       return stream;
     };
 
-    auto deleter = [dev_idx](cudaStream_t stream) {
+    auto deleter = [dev_idx](gpuStream_t stream) {
       platform::SetDeviceId(dev_idx);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
+#endif
     };
 
     pool_.emplace_back(
         ResourcePool<CudaStreamObject>::Create(creator, deleter));
@@ -65,15 +74,24 @@ CudaEventResourcePool::CudaEventResourcePool() {
   for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
     auto creator = [dev_idx] {
       platform::SetDeviceId(dev_idx);
-      cudaEvent_t event;
+      gpuEvent_t event;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipEventCreateWithFlags(&event, hipEventDisableTiming));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+#endif
       return event;
     };
 
-    auto deleter = [dev_idx](cudaEvent_t event) {
+    auto deleter = [dev_idx](gpuEvent_t event) {
       platform::SetDeviceId(dev_idx);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
+#endif
     };
 
     pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
diff --git a/paddle/fluid/platform/cuda_resource_pool.h b/paddle/fluid/platform/cuda_resource_pool.h
index 570b68b08fc1e..2ac13e692f783 100644
--- a/paddle/fluid/platform/cuda_resource_pool.h
+++ b/paddle/fluid/platform/cuda_resource_pool.h
@@ -14,9 +14,17 @@
 
 #pragma once
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 #include <memory>
 #include <type_traits>
 #include <vector>
@@ -26,8 +34,8 @@
 namespace paddle {
 namespace platform {
 
-using CudaStreamObject = std::remove_pointer<cudaStream_t>::type;
-using CudaEventObject = std::remove_pointer<cudaEvent_t>::type;
+using CudaStreamObject = std::remove_pointer<gpuStream_t>::type;
+using CudaEventObject = std::remove_pointer<gpuEvent_t>::type;
 
 class CudaStreamResourcePool {
  public:
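Both pools above hand resources out through a generic ResourcePool with a creator/deleter pair. A hedged sketch of the mechanism (Paddle's actual ResourcePool template differs in detail; this standalone version only demonstrates the recycle-on-release idea):

```cpp
#include <deque>
#include <functional>
#include <memory>
#include <mutex>

// Hand out a pooled object; the shared_ptr deleter returns it to the pool
// instead of destroying it, which is what makes stream/event reuse cheap.
template <typename T>
std::shared_ptr<T> NewFromPool(std::deque<T*>* pool, std::mutex* mtx,
                               const std::function<T*()>& create) {
  T* obj = nullptr;
  {
    std::lock_guard<std::mutex> guard(*mtx);
    if (!pool->empty()) {
      obj = pool->back();
      pool->pop_back();
    }
  }
  if (obj == nullptr) obj = create();
  return std::shared_ptr<T>(obj, [pool, mtx](T* o) {
    std::lock_guard<std::mutex> guard(*mtx);
    pool->push_back(o);  // recycle rather than destroy
  });
}
```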
*/ -#include "paddle/fluid/platform/device_code.h" - #include +#include #include +#include +#include "paddle/fluid/platform/device_code.h" #include "paddle/fluid/platform/enforce.h" DECLARE_string(cuda_dir); @@ -71,26 +72,35 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (is_gpu_place(p)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "CUDAPlace is not supported, please re-compile with WITH_GPU=ON.")); + "CUDAPlace or HIPPlace is not supported, please re-compile with " + "WITH_GPU=ON or WITH_ROCM=ON.")); #endif } } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) CUDADeviceCode::CheckAvailableStatus(); #endif } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP +static bool CheckCUDADriverResult(hipError_t result, std::string caller, + std::string kernel_name = "") { + if (result != hipSuccess) { + const char* error = nullptr; + error = dynload::hipGetErrorString(result); +#else static bool CheckCUDADriverResult(CUresult result, std::string caller, std::string kernel_name = "") { if (result != CUDA_SUCCESS) { const char* error = nullptr; dynload::cuGetErrorString(result, &error); +#endif LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name << " > failed: " << error << " (" << result << ")"; return false; @@ -109,13 +119,23 @@ void CUDADeviceCode::CheckAvailableStatus() { int nvrtc_major = 0; int nvrtc_minor = 0; +#ifdef PADDLE_WITH_HIP + hiprtcResult nvrtc_result = + dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); +#else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); +#endif int driver_version = 0; int dirver_major = 0; int driver_minor = 0; +#ifdef PADDLE_WITH_HIP + hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); + if (driver_result == hipSuccess) { +#else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); if (driver_result == CUDA_SUCCESS) { +#endif dirver_major = driver_version / 1000; driver_minor = (driver_version % 1000) / 10; } @@ -123,13 +143,22 @@ void CUDADeviceCode::CheckAvailableStatus() { LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "." << driver_minor << "; NVRTC Version: " << nvrtc_major << "." << nvrtc_minor; +#ifdef PADDLE_WITH_HIP + if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { +#else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { +#endif return; } int count = 0; +#ifdef PADDLE_WITH_HIP + if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), + "hipGetDeviceCount")) { +#else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), "cuDeviceGetCount")) { +#endif available_ = true; } } @@ -163,14 +192,20 @@ static std::string FindCUDAIncludePath() { } } +#ifdef PADDLE_WITH_HIP + cuda_include_path = "/opt/rocm/include"; +#else cuda_include_path = "/usr/local/cuda/include"; +#endif + if (stat(cuda_include_path.c_str(), &st) == 0) { return cuda_include_path; } - LOG(WARNING) << "Cannot find CUDA include path." - << "Please check whether CUDA is installed in the default " - "installation path, or specify it by export " - "FLAGS_cuda_dir=xxx."; + LOG(WARNING) + << "Cannot find CUDA or ROCM include path." 
+ << "Please check whether CUDA or ROCM is installed in the default " + "installation path, or specify it by export " + "FLAGS_cuda_dir=xxx."; return ""; } @@ -183,7 +218,11 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name, place_ = place; name_ = name; +#ifdef PADDLE_WITH_HIP + kernel_ = "#include \n" + kernel; +#else kernel_ = kernel; +#endif } bool CUDADeviceCode::Compile(bool include_path) { @@ -193,7 +232,84 @@ bool CUDADeviceCode::Compile(bool include_path) { << "NVRTC and CUDA driver are need for JIT compiling of CUDA code."; return false; } +#ifdef PADDLE_WITH_HIP + hiprtcProgram program; + if (!CheckNVRTCResult(dynload::hiprtcCreateProgram(&program, + kernel_.c_str(), // buffer + name_.c_str(), // name + 0, // numHeaders + nullptr, // headers + nullptr), // includeNames + "hiprtcCreateProgram")) { + return false; + } + // Compile the program for specified compute_capability + auto* dev_ctx = reinterpret_cast( + DeviceContextPool::Instance().Get(place_)); + int compute_capability = dev_ctx->GetComputeCapability(); + std::vector options = {"-std=c++11", "--amdgpu-target=gfx906"}; + std::string include_option; + if (include_path) { + std::string cuda_include_path = FindCUDAIncludePath(); + if (!cuda_include_path.empty()) { + include_option = "--include-path=" + cuda_include_path; + options.push_back(include_option.c_str()); + } + } + hiprtcResult compile_result = + dynload::hiprtcCompileProgram(program, // program + options.size(), // numOptions + options.data()); // options + if (compile_result == HIPRTC_ERROR_COMPILATION) { + // Obtain compilation log from the program + size_t log_size; + if (!CheckNVRTCResult(dynload::hiprtcGetProgramLogSize(program, &log_size), + "hiprtcGetProgramLogSize")) { + return false; + } + std::vector log; + log.resize(log_size + 1); + if (!CheckNVRTCResult(dynload::hiprtcGetProgramLog(program, log.data()), + "hiprtcGetProgramLog")) { + return false; + } + LOG(WARNING) << "JIT compiling of ROCM GPU code failed:" + << "\n Kernel name: " << name_ << "\n Kernel body:\n" + << kernel_ << "\n Compiling log: " << log.data(); + + return false; + } + + // Obtain PTX from the program for cuda + // Obtain Code from the program for hip + size_t ptx_size; + if (!CheckNVRTCResult(dynload::hiprtcGetCodeSize(program, &ptx_size), + "hiprtcGetCodeSize")) { + return false; + } + ptx_.resize(ptx_size + 1); + if (!CheckNVRTCResult(dynload::hiprtcGetCode(program, ptx_.data()), + "hiprtcGetCode")) { + return false; + } + + if (!CheckNVRTCResult(dynload::hiprtcDestroyProgram(&program), + "hiprtcDestroyProgram")) { + return false; + } + + if (!CheckCUDADriverResult(dynload::hipModuleLoadData(&module_, ptx_.data()), + "hipModuleLoadData")) { + return false; + } + + if (!CheckCUDADriverResult( + dynload::hipModuleGetFunction(&function_, module_, name_.c_str()), + "hipModuleGetFunction")) { + return false; + } +#else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, kernel_.c_str(), // buffer @@ -271,6 +387,7 @@ bool CUDADeviceCode::Compile(bool include_path) { "cuModuleGetFunction", name_)) { return false; } +#endif max_threads_ = dev_ctx->GetMaxPhysicalThreadCount(); is_compiled_ = true; @@ -291,6 +408,18 @@ void CUDADeviceCode::Launch(const size_t n, std::vector* args) const { auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_EQ( + dynload::hipModuleLaunchKernel(function_, num_blocks, 1, 1, // grid dim + num_threads_, 1, 1, // block dim + 0, // shared 
memory + dev_ctx->stream(), // stream + args->data(), // arguments + nullptr), + hipSuccess, + errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", + name_.c_str())); +#else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, num_blocks, 1, 1, // grid dim num_threads_, 1, 1, // block dim @@ -301,8 +430,19 @@ void CUDADeviceCode::Launch(const size_t n, std::vector* args) const { CUDA_SUCCESS, errors::External("Fail to launch kernel %s (in cuLaunchKernel.)", name_.c_str())); +#endif } +#ifdef PADDLE_WITH_HIP +bool CUDADeviceCode::CheckNVRTCResult(hiprtcResult result, + std::string function) { + if (result != HIPRTC_SUCCESS) { + LOG_FIRST_N(WARNING, 1) + << "Call " << function << " for < " << name_ + << " > failed: " << dynload::hiprtcGetErrorString(result); + return false; + } +#else bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { @@ -311,6 +451,7 @@ bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result, << " > failed: " << dynload::nvrtcGetErrorString(result); return false; } +#endif return true; } #endif diff --git a/paddle/fluid/platform/device_code.h b/paddle/fluid/platform/device_code.h index 4199317a8ceb0..6b1c284abbd7e 100644 --- a/paddle/fluid/platform/device_code.h +++ b/paddle/fluid/platform/device_code.h @@ -25,6 +25,10 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cuda_driver.h" #include "paddle/fluid/platform/dynload/nvrtc.h" #endif +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hiprtc.h" +#include "paddle/fluid/platform/dynload/rocm_driver.h" +#endif namespace paddle { namespace platform { @@ -44,7 +48,7 @@ class DeviceCode { std::string kernel_; }; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class CUDADeviceCode : public DeviceCode { public: explicit CUDADeviceCode(const Place& place, const std::string& name, @@ -61,7 +65,11 @@ class CUDADeviceCode : public DeviceCode { static bool IsAvailable() { return available_; } private: +#ifdef PADDLE_WITH_HIP + bool CheckNVRTCResult(hiprtcResult result, std::string function); +#else bool CheckNVRTCResult(nvrtcResult result, std::string function); +#endif static bool available_; @@ -70,8 +78,13 @@ class CUDADeviceCode : public DeviceCode { int num_threads_{1024}; int workload_per_thread_{1}; std::vector ptx_; +#ifdef PADDLE_WITH_HIP + hipModule_t module_; + hipFunction_t function_; +#else CUmodule module_; CUfunction function_; +#endif }; #endif diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index bb4fceb85de0a..aadfffb59133b 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device_code.h" +#include #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" +#ifdef PADDLE_WITH_CUDA constexpr auto saxpy_code = R"( extern "C" __global__ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { @@ -26,8 +28,22 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { } } )"; +#endif -#ifdef PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_HIP +constexpr auto saxpy_code = R"( +#include +extern "C" __global__ +void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { + for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n; + tid += blockDim.x * gridDim.x) { + z[tid] = a * x[tid] + y[tid]; + } +} +)"; +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(DeviceCode, cuda) { if (!paddle::platform::dynload::HasNVRTC() || !paddle::platform::dynload::HasCUDADriver()) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a04214c701465..c5fb46833f760 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -29,7 +29,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { } if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* default_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto& desired_dev_ctx = @@ -65,7 +65,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { namespace paddle { namespace platform { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool allow_tf32_cublas = true; void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } bool AllowTF32Cublas() { return allow_tf32_cublas; } @@ -122,7 +122,7 @@ DeviceContextPool::DeviceContextPool( EmplaceDeviceContext(&device_contexts_, p); #endif } else if (platform::is_gpu_place(p)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext(&device_contexts_, p); #else PADDLE_THROW( @@ -130,7 +130,7 @@ DeviceContextPool::DeviceContextPool( "re-compile with WITH_GPU option.")); #endif } else if (platform::is_cuda_pinned_place(p)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( &device_contexts_, p); #else @@ -229,7 +229,7 @@ Place XPUDeviceContext::GetPlace() const { return place_; } xpu::Context* XPUDeviceContext::x_context() const { return context_; } #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class EigenCudaStreamDevice : public Eigen::StreamInterface { public: @@ -238,15 +238,19 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } ~EigenCudaStreamDevice() override {} - void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) { + void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) { stream_ = cuda_stream; place_ = place; device_prop_ = &Eigen::m_deviceProperties[place.device]; } - const cudaStream_t& stream() const override { return *stream_; } + const gpuStream_t& 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index a04214c701465..c5fb46833f760 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -12,7 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include <set>
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -29,7 +29,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
   }
 
   if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
         platform::DeviceContextPool::Instance().Get(place));
     auto& desired_dev_ctx =
@@ -65,7 +65,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
 namespace paddle {
 namespace platform {
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 bool allow_tf32_cublas = true;
 void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
 bool AllowTF32Cublas() { return allow_tf32_cublas; }
@@ -122,7 +122,7 @@ DeviceContextPool::DeviceContextPool(
       EmplaceDeviceContext<XPUDeviceContext, XPUPlace>(&device_contexts_, p);
 #endif
     } else if (platform::is_gpu_place(p)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
 #else
       PADDLE_THROW(
          platform::errors::Unimplemented("CUDAPlace is not supported. Please "
                                          "re-compile with WITH_GPU option."));
 #endif
     } else if (platform::is_cuda_pinned_place(p)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
           &device_contexts_, p);
 #else
@@ -229,7 +229,7 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
 xpu::Context* XPUDeviceContext::x_context() const { return context_; }
 #endif
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
  public:
@@ -238,15 +238,19 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
   ~EigenCudaStreamDevice() override {}
 
-  void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
+  void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) {
     stream_ = cuda_stream;
     place_ = place;
     device_prop_ = &Eigen::m_deviceProperties[place.device];
   }
 
-  const cudaStream_t& stream() const override { return *stream_; }
+  const gpuStream_t& stream() const override { return *stream_; }
 
+#ifdef PADDLE_WITH_HIP
+  const hipDeviceProp_t& deviceProperties() const override {
+#else
   const cudaDeviceProp& deviceProperties() const override {
+#endif
     return *device_prop_;
   }
 
@@ -295,16 +299,25 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
       char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
 #endif
       semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+#endif
     }
     return semaphore_;
   }
 
 private:
  CUDAPlace place_;
-  const cudaStream_t* stream_;  // not owned;
+  const gpuStream_t* stream_;  // not owned;
+#ifdef PADDLE_WITH_HIP
+  const hipDeviceProp_t* device_prop_;
+#else
   const cudaDeviceProp* device_prop_;  // not owned;
+#endif
   mutable void* scratch_;
   mutable unsigned int* semaphore_;
   mutable std::mutex mtx_;  // to protect allocations_
@@ -339,14 +352,18 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
   InitEigenContext();
   InitCuBlasContext();
   InitCuDNNContext();
+#ifndef PADDLE_WITH_HIP
   InitCuSolverContext();
+#endif
 }
 
 CUDAContext::~CUDAContext() {
   CUDADeviceGuard guard(place_.device);
   DestoryCuDNNContext();
   DestoryCuBlasContext();
+#ifndef PADDLE_WITH_HIP
   DestoryCuSolverContext();
+#endif
 }
 
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
@@ -369,17 +386,29 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
                           << ", Runtime API Version: "
                           << runtime_version_ / 1000 << "."
                           << (runtime_version_ % 100) / 10;
+#ifdef PADDLE_WITH_HIP
+  size_t version_major, version_minor, version_patch;
+  PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
+      &version_major, &version_minor, &version_patch));
+  LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
+                          << ", MIOpen Version: " << version_major << "."
+                          << version_minor << "." << version_patch;
+#else
   size_t cudnn_dso_ver = dynload::cudnnGetVersion();
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
                           << (cudnn_dso_ver % 1000) / 100 << ".";
-
+#endif
   {
     // Check CUDA/CUDNN version compatiblity
     auto local_cuda_version =
         (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
+#ifdef PADDLE_WITH_HIP
+    auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
+#else
     auto compile_cuda_version =
         (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
+#endif
     if (local_cuda_version < compile_cuda_version) {
       LOG_FIRST_N(WARNING, 1)
           << "WARNING: device: " << place_.device
@@ -397,7 +426,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
 
 CUDADeviceContext::~CUDADeviceContext() {
   SetDeviceId(place_.device);
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   if (nccl_comm_) {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
   }
@@ -434,7 +463,11 @@ dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
   return max_grid_dim_size_;
 }
 
+#ifdef PADDLE_WITH_HIP
+miopenHandle_t CUDADeviceContext::cudnn_handle() const {
+#else
 cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
+#endif
   return context()->CudnnHandle();
 }
 
@@ -442,13 +475,13 @@ CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
   return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
 }
 
+#ifndef PADDLE_WITH_HIP
 cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
   return context()->CusolverDnHandle();
 }
+#endif
 
-cudaStream_t CUDADeviceContext::stream() const {
-  return context()->RawStream();
-}
+gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
 
 CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
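SetAllowTF32Cublas above is a process-global switch for TF32 math in cuBLAS (Ampere and newer), and the same symbol is now compiled for ROCm builds too. A short, hedged usage sketch:

```cpp
#include "paddle/fluid/platform/device_context.h"

// Temporarily force strict FP32 GEMMs for a numerically sensitive section.
void RunWithStrictFP32() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  bool previous = paddle::platform::AllowTF32Cublas();
  paddle::platform::SetAllowTF32Cublas(false);
  // ... enqueue cuBLAS/rocBLAS work through a CUDADeviceContext here ...
  paddle::platform::SetAllowTF32Cublas(previous);
#endif
}
```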
*/ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/stream/cuda_stream.h" #endif #include "unsupported/Eigen/CXX11/Tensor" @@ -62,7 +72,7 @@ struct GpuDevice; namespace paddle { namespace platform { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /*Set the value of the global variable allow_tf32_cublas*/ void SetAllowTF32Cublas(bool active); /*Get the global variable allow_tf32_cublas value*/ @@ -153,7 +163,7 @@ struct DefaultDeviceContextType { }; #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class CudnnWorkspaceHandle; class EigenCudaStreamDevice; @@ -179,13 +189,19 @@ class CUDAContext { const std::unique_ptr& Stream() const { return stream_; } - const cudaStream_t& RawStream() { return stream_->raw_stream(); } + const gpuStream_t& RawStream() { return stream_->raw_stream(); } +#ifdef PADDLE_WITH_HIP + const miopenHandle_t& CudnnHandle() const { return cudnn_handle_; } +#else const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; } +#endif +#ifndef PADDLE_WITH_HIP const cusolverDnHandle_t& CusolverDnHandle() const { return cusolver_dn_handle_; } +#endif const std::unique_ptr& CublasHandle() const { return cublas_handle_; @@ -222,6 +238,11 @@ class CUDAContext { private: void InitEigenContext(); +#ifdef PADDLE_WITH_HIP + void InitCuBlasContext() { + cublas_handle_.reset(new CublasHandleHolder(RawStream())); + } +#else void InitCuBlasContext() { cublas_handle_.reset( new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH)); @@ -236,9 +257,32 @@ class CUDAContext { #endif // CUDA_VERSION >= 9000 } } +#endif void InitCuDNNContext() { if (dynload::HasCUDNN()) { +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( + &miopen_major, &miopen_minor, &miopen_patch)); + auto local_miopen_version = + (miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100; + auto compile_miopen_version = MIOPEN_VERSION / 100; + if (local_miopen_version < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 10 << "." << compile_miopen_version % 10 + << ", but MIOPEN version in your machine is " + << local_miopen_version / 10 << "." << local_miopen_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenSetStream(cudnn_handle_, RawStream())); +#else auto local_cudnn_version = dynload::cudnnGetVersion() / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; if (local_cudnn_version < static_cast(compile_cudnn_version)) { @@ -255,20 +299,27 @@ class CUDAContext { PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_RETRY_CUDA_SUCCESS( dynload::cudnnSetStream(cudnn_handle_, RawStream())); +#endif } else { cudnn_handle_ = nullptr; } } +#ifndef PADDLE_WITH_HIP void InitCuSolverContext() { PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_)); PADDLE_RETRY_CUDA_SUCCESS( dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream())); } +#endif void DestoryCuDNNContext() { if (cudnn_handle_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); +#endif } cudnn_handle_ = nullptr; } @@ -279,22 +330,30 @@ class CUDAContext { cublas_tf32_tensor_core_handle_.reset(); } +#ifndef PADDLE_WITH_HIP void DestoryCuSolverContext() { if (cusolver_dn_handle_) { PADDLE_ENFORCE_CUDA_SUCCESS( dynload::cusolverDnDestroy(cusolver_dn_handle_)); } } +#endif CUDAPlace place_; std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; std::unique_ptr stream_; +#ifdef PADDLE_WITH_HIP + miopenHandle_t cudnn_handle_; +#else cudnnHandle_t cudnn_handle_; +#endif std::unique_ptr cublas_handle_; std::unique_ptr cublas_tensor_core_handle_; std::unique_ptr cublas_tf32_tensor_core_handle_; +#ifndef PADDLE_WITH_HIP cusolverDnHandle_t cusolver_dn_handle_; +#endif DISABLE_COPY_AND_ASSIGN(CUDAContext); }; @@ -343,8 +402,12 @@ class CUDADeviceContext : public DeviceContext { return context()->TensorCoreCublasCallIfAvailable(callback); } - /*! \brief Return cudnn handle in the device context. */ +/*! \brief Return cudnn handle in the device context. */ +#ifdef PADDLE_WITH_HIP + miopenHandle_t cudnn_handle() const; +#else cudnnHandle_t cudnn_handle() const; +#endif /*! \brief Return a cudnn workspace handle to call multiple cudnn * functions without interrupting by other threads. @@ -355,12 +418,14 @@ class CUDADeviceContext : public DeviceContext { * sequential cudnn function calls. */ CudnnWorkspaceHandle cudnn_workspace_handle() const; +#ifndef PADDLE_WITH_HIP cusolverDnHandle_t cusolver_dn_handle() const; +#endif /*! \brief Return cuda stream in the device context. */ - cudaStream_t stream() const; + gpuStream_t stream() const; -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) /*! \brief Return nccl communicators. */ ncclComm_t nccl_comm() const { return nccl_comm_; } @@ -369,7 +434,7 @@ class CUDADeviceContext : public DeviceContext { #endif template - void RecordEvent(cudaEvent_t ev, Callback callback) const { + void RecordEvent(gpuEvent_t ev, Callback callback) const { return context()->Stream()->RecordEvent(ev, callback); } @@ -411,7 +476,7 @@ class CUDADeviceContext : public DeviceContext { mutable std::mutex cudnn_handle_mtx_; -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NCCL communicator (single process version) for NCCL collective operations. // NCCL collective operations provides fast collectives over multiple GPUs // both within and across nodes. 
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
index 5b3aa98efb46b..857d5d2765160 100644
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -41,7 +41,11 @@ TEST(Device, CUDADeviceContext) {
     CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
     Eigen::GpuDevice* gpu_device = device_context->eigen_device();
     ASSERT_NE(nullptr, gpu_device);
+#ifdef PADDLE_WITH_HIP
+    miopenHandle_t cudnn_handle = device_context->cudnn_handle();
+#else
     cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
+#endif
     ASSERT_NE(nullptr, cudnn_handle);
     delete device_context;
   }
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index d873ac619f347..47ade89ff2df3 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -42,8 +42,7 @@ limitations under the License. */
 #include
 #include
 #include
-#include  // NOLINT
-#include "paddle/fluid/platform/cuda_error.pb.h"  // NOLINT
+#include  // NOLINT
 #endif
 
 #include
@@ -1034,11 +1033,6 @@ inline void retry_sleep(unsigned milliseconds) {
 inline bool is_error(hipError_t e) { return e != hipSuccess; }
 
 inline std::string build_rocm_error_msg(hipError_t e) {
-#if defined(PADDLE_WITH_HIP)
-  int32_t cuda_version = 100;
-#else
-  int32_t cuda_version = -1;
-#endif
   std::ostringstream sout;
   sout << " Hip error(" << e << "), " << hipGetErrorString(e) << ".";
   return sout.str();
diff --git a/paddle/fluid/platform/miopen_desc.h b/paddle/fluid/platform/miopen_desc.h
new file mode 100644
index 0000000000000..68db32bac103b
--- /dev/null
+++ b/paddle/fluid/platform/miopen_desc.h
@@ -0,0 +1,221 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/miopen_helper.h"
+
+namespace paddle {
+namespace framework {
+class Tensor;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace platform {
+using framework::Tensor;
+
+template <typename T>
+inline miopenDataType_t ToMIOpenDataType(const T& t) {
+  auto type = framework::ToDataType(t);
+  return ToMIOpenDataType(type);
+}
+
+inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
+  std::vector<int> transformed_dims(dims.begin(), dims.end());
+  int H, W, D, C;
+  if (dims.size() == 4) {
+    H = dims[1];
+    W = dims[2];
+    C = dims[3];
+    transformed_dims[1] = C;
+    transformed_dims[2] = H;
+    transformed_dims[3] = W;
+  } else {
+    D = dims[1];
+    H = dims[2];
+    W = dims[3];
+    C = dims[4];
+    transformed_dims[1] = C;
+    transformed_dims[2] = D;
+    transformed_dims[3] = H;
+    transformed_dims[4] = W;
+  }
+  return transformed_dims;
+}
+
+template <>
+inline miopenDataType_t ToMIOpenDataType(
+    const framework::proto::VarType::Type& t) {
+  miopenDataType_t type = miopenFloat;
+  switch (t) {
+    case framework::proto::VarType::FP16:
+      type = miopenHalf;
+      break;
+    case framework::proto::VarType::FP32:
+      type = miopenFloat;
+      break;
+    default:
+      break;
+  }
+  return type;
+}
+
+class ActivationDescriptor {
+ public:
+  ActivationDescriptor() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::miopenCreateActivationDescriptor(&desc_));
+  }
+  ~ActivationDescriptor() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::miopenDestroyActivationDescriptor(desc_));
+  }
+  template <typename T>
+  void set(miopenActivationMode_t mode, const T& coef) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor(
+        desc_, mode, static_cast<double>(coef), 0.0, 0.0));
+  }
+
+  miopenActivationDescriptor_t desc() { return desc_; }
+  miopenActivationDescriptor_t desc() const { return desc_; }
+
+ private:
+  miopenActivationDescriptor_t desc_;
+};
+
+class TensorDescriptor {
+ public:
+  TensorDescriptor() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
+  }
+  ~TensorDescriptor() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
+  }
+  miopenTensorDescriptor_t desc() { return desc_; }
+  miopenTensorDescriptor_t desc() const { return desc_; }
+
+  void set(const Tensor& tensor, const int groups = 1) {
+    auto dims = framework::vectorize<int>(tensor.dims());
+    std::vector<int> strides(dims.size());
+    strides[dims.size() - 1] = 1;
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
+    }
+    std::vector<int> dims_with_group(dims.begin(), dims.end());
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
+        desc_, ToMIOpenDataType(tensor.type()),
+        static_cast<int>(dims_with_group.size()),
+        const_cast<int*>(dims_with_group.data()),
+        const_cast<int*>(strides.data())));
+  }
+
+  void set(const Tensor& tensor, const miopenTensorFormat_t format) {
+    const int groups = 1;
+    auto dims = framework::vectorize<int>(tensor.dims());
+    std::vector<int> strides(dims.size());
+    strides[dims.size() - 1] = 1;
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
+    }
+    std::vector<int> dims_with_group(dims.begin(), dims.end());
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
+        desc_, ToMIOpenDataType(tensor.type()),
+        static_cast<int>(dims_with_group.size()),
+        const_cast<int*>(dims_with_group.data()),
+        const_cast<int*>(strides.data())));
+  }
+
+ private:
+  miopenTensorDescriptor_t desc_;
+};
+
+class FilterDescriptor {
+ public:
+  FilterDescriptor() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
+  }
+  ~FilterDescriptor() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
+  }
+  miopenTensorDescriptor_t desc() { return desc_; }
+  miopenTensorDescriptor_t desc() const { return desc_; }
+
+  void set(const Tensor& tensor, const miopenTensorFormat_t format,
+           const int groups = 1) {
+    auto dims = framework::vectorize<int>(tensor.dims());
+    std::vector<int> transformed_dims;
+    PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
+                      platform::errors::InvalidArgument(
+                          "format should ONLY be NCHW in MIOPEN."));
+    transformed_dims = dims;
+    if (groups > 1) {
+      transformed_dims[1] = transformed_dims[1] / groups;
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
+        desc_, ToMIOpenDataType(tensor.type()),
+        static_cast<int>(transformed_dims.size()),
+        const_cast<int*>(transformed_dims.data()), nullptr));
+  }
+
+ private:
+  miopenTensorDescriptor_t desc_;
+};
+
+class ConvolutionDescriptor {
+ public:
+  ConvolutionDescriptor() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::miopenCreateConvolutionDescriptor(&desc_));
+  }
+  ~ConvolutionDescriptor() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::miopenDestroyConvolutionDescriptor(desc_));
+  }
+  miopenConvolutionDescriptor_t desc() { return desc_; }
+  miopenConvolutionDescriptor_t desc() const { return desc_; }
+
+  void set(miopenDataType_t dtype, const std::vector<int>& pads,
+           const std::vector<int>& strides, const std::vector<int>& dilations,
+           bool allow_tf32, const int groups = 1) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor(
+        desc_, static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
+        const_cast<int*>(strides.data()), const_cast<int*>(dilations.data()),
+        miopenConvolution));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::miopenSetConvolutionGroupCount(desc_, groups));
+  }
+
+ private:
+  miopenConvolutionDescriptor_t desc_;
+};
+
+}  // namespace platform
+}  // namespace paddle
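A hedged sketch of how these RAII descriptor wrappers fit together for a grouped convolution (the tensors are assumed to be 4-D NCHW float tensors prepared elsewhere; the resulting handles would then feed dynload::miopenConvolution* calls):

```cpp
#include "paddle/fluid/platform/miopen_desc.h"

void DescribeGroupedConv(const paddle::framework::Tensor& input,
                         const paddle::framework::Tensor& filter) {
  paddle::platform::TensorDescriptor in_desc;
  in_desc.set(input);  // strides derived from dims, NCHW layout

  paddle::platform::FilterDescriptor filter_desc;
  filter_desc.set(filter, MIOPEN_TENSOR_NCHW, /*groups=*/2);

  paddle::platform::ConvolutionDescriptor conv_desc;
  conv_desc.set(miopenFloat, /*pads=*/{1, 1}, /*strides=*/{1, 1},
                /*dilations=*/{1, 1}, /*allow_tf32=*/false, /*groups=*/2);
  // in_desc.desc(), filter_desc.desc() and conv_desc.desc() are now ready;
  // the wrappers' destructors release the MIOpen descriptors automatically.
}
```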
diff --git a/tools/dockerfile/Dockerfile.rocm b/tools/dockerfile/Dockerfile.rocm
index 2f624b2d9784b..6ae6b8963b7f5 100644
--- a/tools/dockerfile/Dockerfile.rocm
+++ b/tools/dockerfile/Dockerfile.rocm
@@ -1,29 +1,18 @@
 # A image for building paddle binaries
 # Use rocm-terminal base image for both rocm environment
 # When you modify it, please be aware of rocm version
-#
-# Build: ROCM 3.5.1
-# cd Paddle/tools/dockerfile
-# docker build -f Dockerfile.rocm \
-#        --build-arg ROCM_VERSION=3.5.1 \
-#        --build-arg CENTOS_VERSION=7.7.1908 \
-#        -t paddlepaddle/paddle-centos-rocm35-dev:latest .
 #
-# Build: ROCM 3.9.1
+# Build: ROCM 3.9
 # cd Paddle/tools/dockerfile
 # docker build -f Dockerfile.rocm \
-#        --build-arg ROCM_VERSION=3.9.1 \
-#        --build-arg CENTOS_VERSION=7.8.2003 \
+#        --build-arg ROCM_VERSION=3.9 \
 #        -t paddlepaddle/paddle-centos-rocm39-dev:latest .
 #
-# Run: ROCM 3.5.1
 # docker run -it --device=/dev/kfd --device=/dev/dri \
 #        --security-opt seccomp=unconfined --group-add video \
-#        paddlepaddle/paddle-centos-rocm35-dev:latest /bin/bash
+#        paddlepaddle/paddle-centos-rocm39-dev:latest /bin/bash
 
-ARG CENTOS_VERSION
-FROM centos:${CENTOS_VERSION}
-ARG CENTOS_VERSION
+FROM centos:7.8.2003
 MAINTAINER PaddlePaddle Authors
 
 ENV LC_ALL en_US.UTF-8
@@ -34,7 +23,7 @@ RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlit
     zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
     make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
 
-# Install devtoolset-7 for ROCM 3.5/3.9
+# Install devtoolset-7
 RUN yum install -y yum-utils centos-release-scl && \
     yum-config-manager --enable rhel-server-rhscl-7-rpms && \
     yum-config-manager --enable rhel-7-server-rpms && \
@@ -70,10 +59,8 @@ ENV ROCM_PATH=/opt/rocm
 ENV HIP_PATH=/opt/rocm/hip
 ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin
 ENV PATH=/opt/rocm/bin:$PATH
-ENV PATH=/opt/rocm/hcc/bin:$PATH
-ENV PATH=/opt/rocm/hip/bin:$PATH
 ENV PATH=/opt/rocm/opencl/bin:$PATH
-ENV PATH=/opt/rocm/llvm/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
 
 # git 2.17.1
 RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
@@ -146,4 +133,12 @@ RUN cd /opt && wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
     ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \
     cd .. && rm -rf ccache-3.7.9.tar.gz && rm -rf ccache-3.7.9
 
+# configure ssh
+RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
+    sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
+    sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config && \
+    sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config
+
+RUN ssh-keygen -A
+
 EXPOSE 22