[ROCM] fix concat and split (#55821)
ronny1996 authored Aug 1, 2023
1 parent a00f5bd commit d7aef89
Showing 6 changed files with 27 additions and 56 deletions.
11 changes: 7 additions & 4 deletions cmake/hip.cmake
@@ -113,11 +113,14 @@ list(APPEND HIP_CXX_FLAGS -Wno-dangling-gsl)
 list(APPEND HIP_CXX_FLAGS -Wno-unused-value)
 list(APPEND HIP_CXX_FLAGS -Wno-braced-scalar-init)
 list(APPEND HIP_CXX_FLAGS -Wno-return-type)
+list(APPEND HIP_CXX_FLAGS -Wno-pragma-once-outside-header)
+
 if(WITH_CINN)
   list(APPEND HIP_CXX_FLAGS -std=c++14)
 else()
   list(APPEND HIP_CXX_FLAGS -std=c++17)
 endif()
+list(APPEND HIP_CXX_FLAGS --gpu-max-threads-per-block=1024)
 
 if(CMAKE_BUILD_TYPE MATCHES Debug)
   list(APPEND HIP_CXX_FLAGS -g2)
@@ -130,11 +133,11 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
 # Ask hcc to generate device code during compilation so we can use
 # host linker to link.
 list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx908)
 list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx908)
 
 if(HIP_COMPILER STREQUAL clang)
   set(hip_library_name amdhip64)
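Note on the cmake change: hipcc has deprecated the --amdgpu-target= spelling in favor of --offload-arch=, so this is a flag rename that keeps the same gfx906/gfx908 targets. Below is a minimal standalone sketch, not part of this commit, that prints the architecture of device 0; it assumes the ROCm HIP runtime and the gcnArchName field of hipDeviceProp_t. The reported architecture must be covered by one of the compiled-in --offload-arch values for the binary's kernels to load.

// check_arch.cc: an illustrative sketch, not part of this commit.
// Assumes the ROCm HIP runtime and hipDeviceProp_t::gcnArchName.
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  hipDeviceProp_t prop;
  if (hipGetDeviceProperties(&prop, 0) != hipSuccess) {
    std::fprintf(stderr, "no HIP device visible\n");
    return 1;
  }
  // Prints e.g. "gfx908:sramecc+:xnack-"; the token before the first ':'
  // is what an --offload-arch value has to match.
  std::printf("device 0: %s\n", prop.gcnArchName);
  return 0;
}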
4 changes: 4 additions & 0 deletions paddle/phi/common/bfloat16.h
@@ -31,11 +31,15 @@
 #include <cuda_bf16.h>
 #endif
 
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 
 namespace phi {
 namespace dtype {
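The same guard is added to complex.h, float16.h, and segmented_array.h below: under HIP, PADDLE_ALIGN(x) now expands to nothing, so phi's custom types fall back to the natural alignment of their members when compiled by hipcc. A small self-contained sketch of how such a macro is consumed; the struct is an illustration modeled on phi::dtype::bfloat16, not the actual header.

// align_sketch.cc: illustration only; the macro mirrors the diff above.
#include <cstdint>

#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
#else
#define PADDLE_ALIGN(x)
#endif

// With the HIP branch active, PADDLE_ALIGN(2) expands to nothing and the
// struct keeps the natural 2-byte alignment of its uint16_t member, so for
// this type the layout is identical either way.
struct PADDLE_ALIGN(2) bfloat16_like {
  uint16_t x;
};

static_assert(sizeof(bfloat16_like) == 2, "still two bytes");
static_assert(alignof(bfloat16_like) == 2, "naturally 2-byte aligned");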
4 changes: 4 additions & 0 deletions paddle/phi/common/complex.h
@@ -31,11 +31,15 @@
 #include <thrust/complex.h> // NOLINT
 #endif
 
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // todo
4 changes: 4 additions & 0 deletions paddle/phi/common/float16.h
@@ -51,11 +51,15 @@
 #include <hip/hip_fp16.h>
 #endif
 
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 
 #define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)
 
56 changes: 4 additions & 52 deletions paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -49,11 +49,15 @@ static inline void GetBlockDims(const phi::GPUContext& context,
   *grid_dims = dim3(grid_cols, grid_rows, 1);
 }
 
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 
 template <typename T, int Size>
 struct PointerWrapper {
@@ -572,15 +576,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
   std::vector<IndexT> inputs_col_vec(inputs_col_num, 0);
   const T** inputs_data = inputs_data_vec.data();
   IndexT* inputs_col = inputs_col_vec.data();
-#ifdef PADDLE_WITH_HIP
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  phi::Allocator::AllocationPtr data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
-  inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-  phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
-      phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
-  inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
-#endif
 
   bool has_same_shape = true;
   for (int i = 0; i < in_num; ++i) {
@@ -604,19 +599,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
                           in_num,
                           limit_num,
                           has_same_shape);
-
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* col_alloc_released = col_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
-    VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
-    VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(col_alloc_released);
-  });
-#endif
 }
 
 template <typename T>
@@ -780,25 +762,6 @@ void SplitFunctorDispatchWithIndexType(
   IndexT* outs_cols = outputs_cols_vec.data();
   T** outs_data = nullptr;
 
-  // There are some differences between hip runtime and NV runtime.
-  // In NV, when the pageable memory data less than 64K is transferred from
-  // hosttodevice, it will be automatically asynchronous.
-  // However, only pinned memory in hip can copy asynchronously
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-  // 3.2.6.1. Concurrent Execution between Host and Device
-  // Memory copies from host to device of a memory block of 64 KB or less
-#ifdef PADDLE_WITH_HIP
-  phi::Allocator::AllocationPtr data_alloc, cols_alloc;
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
-  outs_data = reinterpret_cast<T**>(data_alloc->ptr());
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
-                                        (out_cols_num) * sizeof(IndexT));
-  outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
-#endif
-
   outs_cols[0] = 0;
   for (int i = 0; i < out_num; ++i) {
     IndexT t_col = ref_ins.at(i)->numel() / out_row;
@@ -835,17 +798,6 @@ void SplitFunctorDispatchWithIndexType(
                                             outs_data));
     }
   }
-
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* cols_alloc_released = cols_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(cols_alloc_released);
-  });
-#endif
 }
 
 template <typename T>
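The bulk of the deletions in this file drop a HIP-only staging path for the concat/split metadata arrays. The removed comment block records the original motivation: the CUDA runtime stages host-to-device copies of pageable blocks under 64 KB asynchronously on its own, while HIP only overlaps copies whose source is pinned memory, so the old code bounced the pointer and column arrays through GPUPinnedPlace allocations and released them from a stream callback. This commit deletes that branch, letting ROCm builds take the same code path as CUDA. For reference, here is a minimal standalone sketch of the deleted pattern, written against the raw HIP runtime API rather than Paddle's allocator; names and sizes are illustrative.

// pinned_callback_sketch.cc: illustration of the removed pattern using raw
// HIP calls (the real code used phi::memory_utils::Alloc and
// ctx.AddStreamCallback). Not part of this commit.
#include <hip/hip_runtime.h>
#include <cstdio>

// Runs once the stream has drained past the copy that reads the buffer,
// so the pinned staging memory can be reclaimed safely.
static void ReleasePinned(hipStream_t /*stream*/, hipError_t /*status*/,
                          void* user_data) {
  (void)hipHostFree(user_data);
}

int main() {
  hipStream_t stream;
  (void)hipStreamCreate(&stream);

  // Stage the small metadata array in pinned memory so the H2D copy below
  // is genuinely asynchronous on HIP.
  int* host_cols = nullptr;
  (void)hipHostMalloc(reinterpret_cast<void**>(&host_cols), 4 * sizeof(int),
                      hipHostMallocDefault);
  for (int i = 0; i < 4; ++i) host_cols[i] = i;

  int* dev_cols = nullptr;
  (void)hipMalloc(&dev_cols, 4 * sizeof(int));
  (void)hipMemcpyAsync(dev_cols, host_cols, 4 * sizeof(int),
                       hipMemcpyHostToDevice, stream);

  // Defer the free until the stream reaches this point, instead of
  // blocking the host with an immediate synchronize.
  (void)hipStreamAddCallback(stream, ReleasePinned, host_cols, 0);

  (void)hipStreamSynchronize(stream);
  (void)hipFree(dev_cols);
  (void)hipStreamDestroy(stream);
  std::puts("done");
  return 0;
}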
4 changes: 4 additions & 0 deletions paddle/phi/kernels/funcs/segmented_array.h
@@ -21,11 +21,15 @@
 namespace phi {
 namespace funcs {
 
+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif
 
 enum class SegmentedArraySize {
   kVariableLength = 0,
