From 0ff8d9ced7a0780b2107e905262d3aca5a78c7e7 Mon Sep 17 00:00:00 2001 From: ronny1996 Date: Mon, 31 Jul 2023 02:28:32 +0000 Subject: [PATCH] [ROCM] Fix concat and split on HIP: remove the HIP-only pinned-memory staging buffers and stream callbacks, make PADDLE_ALIGN a no-op under PADDLE_WITH_HIP, and build with --offload-arch instead of --amdgpu-target --- cmake/hip.cmake | 11 ++-- paddle/phi/common/bfloat16.h | 4 ++ paddle/phi/common/complex.h | 4 ++ paddle/phi/common/float16.h | 4 ++ .../kernels/funcs/concat_and_split_functor.cu | 56 ++----------------- paddle/phi/kernels/funcs/segmented_array.h | 4 ++ 6 files changed, 27 insertions(+), 56 deletions(-) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 83c283a10d5769..51142899513180 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -113,11 +113,14 @@ list(APPEND HIP_CXX_FLAGS -Wno-dangling-gsl) list(APPEND HIP_CXX_FLAGS -Wno-unused-value) list(APPEND HIP_CXX_FLAGS -Wno-braced-scalar-init) list(APPEND HIP_CXX_FLAGS -Wno-return-type) +list(APPEND HIP_CXX_FLAGS -Wno-pragma-once-outside-header) + if(WITH_CINN) list(APPEND HIP_CXX_FLAGS -std=c++14) else() list(APPEND HIP_CXX_FLAGS -std=c++17) endif() +list(APPEND HIP_CXX_FLAGS --gpu-max-threads-per-block=1024) if(CMAKE_BUILD_TYPE MATCHES Debug) list(APPEND HIP_CXX_FLAGS -g2) @@ -130,11 +133,11 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) # Ask hcc to generate device code during compilation so we can use # host linker to link. 
list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc) -list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906) -list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908) +list(APPEND HIP_HCC_FLAGS --offload-arch=gfx906) +list(APPEND HIP_HCC_FLAGS --offload-arch=gfx908) list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc) -list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906) -list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908) +list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx906) +list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx908) if(HIP_COMPILER STREQUAL clang) set(hip_library_name amdhip64) diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 7ea9b0cbb64777..ccaf9b82bc5a53 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -31,11 +31,15 @@ #include #endif +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else #define PADDLE_ALIGN(x) __declspec(align(x)) #endif +#else +#define PADDLE_ALIGN(x) +#endif namespace phi { namespace dtype { diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index 833ddcf46b2fee..130047f850426e 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -31,11 +31,15 @@ #include // NOLINT #endif +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else #define PADDLE_ALIGN(x) __declspec(align(x)) #endif +#else +#define PADDLE_ALIGN(x) +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // todo diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 86168d441ded2a..9d60b8c6241ae3 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -51,11 +51,15 @@ #include #endif +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else #define PADDLE_ALIGN(x) __declspec(align(x)) #endif +#else +#define PADDLE_ALIGN(x) +#endif #define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600) diff --git 
a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 5a7574b56a8917..d2e01503d43b25 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -49,11 +49,15 @@ static inline void GetBlockDims(const phi::GPUContext& context, *grid_dims = dim3(grid_cols, grid_rows, 1); } +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else #define PADDLE_ALIGN(x) #endif +#else +#define PADDLE_ALIGN(x) +#endif template struct PointerWrapper { @@ -572,15 +576,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx, std::vector inputs_col_vec(inputs_col_num, 0); const T** inputs_data = inputs_data_vec.data(); IndexT* inputs_col = inputs_col_vec.data(); -#ifdef PADDLE_WITH_HIP - // TODO(chentianyu03): try to find a method to remove the Alloc function - phi::Allocator::AllocationPtr data_alloc = - phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*)); - inputs_data = reinterpret_cast(data_alloc->ptr()); - phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc( - phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT)); - inputs_col = reinterpret_cast(col_alloc->ptr()); -#endif bool has_same_shape = true; for (int i = 0; i < in_num; ++i) { @@ -604,19 +599,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx, in_num, limit_num, has_same_shape); - -#ifdef PADDLE_WITH_HIP - // Prevent pinned memory from being covered and release the memory after - // kernel launch of the stream is executed (reapply pinned memory next time) - auto* data_alloc_released = data_alloc.release(); - auto* col_alloc_released = col_alloc.release(); - ctx.AddStreamCallback([data_alloc_released, col_alloc_released] { - VLOG(4) << "Delete cuda pinned at " << data_alloc_released; - VLOG(4) << "Delete cuda pinned at " << col_alloc_released; - 
phi::memory_utils::AllocationDeleter(data_alloc_released); - phi::memory_utils::AllocationDeleter(col_alloc_released); - }); -#endif } template @@ -780,25 +762,6 @@ void SplitFunctorDispatchWithIndexType( IndexT* outs_cols = outputs_cols_vec.data(); T** outs_data = nullptr; -// There are some differences between hip runtime and NV runtime. -// In NV, when the pageable memory data less than 64K is transferred from -// hosttodevice, it will be automatically asynchronous. -// However, only pinned memory in hip can copy asynchronously -// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device -// 3.2.6.1. Concurrent Execution between Host and Device -// Memory copies from host to device of a memory block of 64 KB or less -#ifdef PADDLE_WITH_HIP - phi::Allocator::AllocationPtr data_alloc, cols_alloc; - // TODO(chentianyu03): try to find a method to remove the Alloc function - data_alloc = - phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*)); - outs_data = reinterpret_cast(data_alloc->ptr()); - // TODO(chentianyu03): try to find a method to remove the Alloc function - cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(), - (out_cols_num) * sizeof(IndexT)); - outs_cols = reinterpret_cast(cols_alloc->ptr()); -#endif - outs_cols[0] = 0; for (int i = 0; i < out_num; ++i) { IndexT t_col = ref_ins.at(i)->numel() / out_row; @@ -835,17 +798,6 @@ void SplitFunctorDispatchWithIndexType( outs_data)); } } - -#ifdef PADDLE_WITH_HIP - // Prevent pinned memory from being covered and release the memory after - // kernel launch of the stream is executed (reapply pinned memory next time) - auto* data_alloc_released = data_alloc.release(); - auto* cols_alloc_released = cols_alloc.release(); - ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] { - phi::memory_utils::AllocationDeleter(data_alloc_released); - phi::memory_utils::AllocationDeleter(cols_alloc_released); - }); -#endif } template diff --git 
a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h index 60e25a95747ead..e6ecb9819e5054 100644 --- a/paddle/phi/kernels/funcs/segmented_array.h +++ b/paddle/phi/kernels/funcs/segmented_array.h @@ -21,11 +21,15 @@ namespace phi { namespace funcs { +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else #define PADDLE_ALIGN(x) #endif +#else +#define PADDLE_ALIGN(x) +#endif enum class SegmentedArraySize { kVariableLength = 0,