[ROCM] fix concat and split #55821

Merged · 1 commit · Aug 1, 2023
cmake/hip.cmake (7 additions, 4 deletions)

@@ -113,11 +113,14 @@ list(APPEND HIP_CXX_FLAGS -Wno-dangling-gsl)
 list(APPEND HIP_CXX_FLAGS -Wno-unused-value)
 list(APPEND HIP_CXX_FLAGS -Wno-braced-scalar-init)
 list(APPEND HIP_CXX_FLAGS -Wno-return-type)
+list(APPEND HIP_CXX_FLAGS -Wno-pragma-once-outside-header)

 if(WITH_CINN)
   list(APPEND HIP_CXX_FLAGS -std=c++14)
 else()
   list(APPEND HIP_CXX_FLAGS -std=c++17)
 endif()
+list(APPEND HIP_CXX_FLAGS --gpu-max-threads-per-block=1024)

 if(CMAKE_BUILD_TYPE MATCHES Debug)
   list(APPEND HIP_CXX_FLAGS -g2)
@@ -130,11 +133,11 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
 # Ask hcc to generate device code during compilation so we can use
 # host linker to link.
 list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx908)
 list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx908)

 if(HIP_COMPILER STREQUAL clang)
   set(hip_library_name amdhip64)
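Note on the flag change: --amdgpu-target is deprecated in recent ROCm/LLVM toolchains in favor of --offload-arch, which names the same gfx ISAs. As a quick sanity check that a local device actually matches one of the listed targets, here is a minimal HIP sketch (not part of this PR; the gfx strings come from the flags above) that prints the architecture each device reports:

// Sanity-check sketch (not part of this PR): print each device's gfx arch,
// which should match one of the --offload-arch values (e.g. gfx906, gfx908).
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  if (hipGetDeviceCount(&count) != hipSuccess || count == 0) {
    std::printf("no HIP devices visible\n");
    return 1;
  }
  for (int i = 0; i < count; ++i) {
    hipDeviceProp_t prop;
    if (hipGetDeviceProperties(&prop, i) == hipSuccess) {
      // gcnArchName may carry feature suffixes, e.g. "gfx908:sramecc+:xnack-"
      std::printf("device %d: %s\n", i, prop.gcnArchName);
    }
  }
  return 0;
}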
paddle/phi/common/bfloat16.h (4 additions, 0 deletions)

@@ -31,11 +31,15 @@
 #include <cuda_bf16.h>
 #endif

+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

 namespace phi {
 namespace dtype {
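The change above (repeated in complex.h, float16.h, and segmented_array.h below) makes PADDLE_ALIGN a no-op when building with HIP, so these types fall back to the compiler's natural alignment instead of a forced one. A minimal sketch of what the macro now does on each path — the __HIP_PLATFORM_AMD__ test here is just a stand-in for Paddle's PADDLE_WITH_HIP build flag:

#include <cstdint>

// Stand-in for Paddle's PADDLE_WITH_HIP configuration macro.
#if !defined(__HIP_PLATFORM_AMD__)
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
#else
#define PADDLE_ALIGN(x)  // HIP build: no forced alignment
#endif

// Mirrors the layout of phi::dtype::bfloat16: one 16-bit payload.
struct PADDLE_ALIGN(2) bfloat16_like {
  uint16_t x;
};

// The natural alignment of a single uint16_t member is already 2, so for
// this type the size and alignment come out the same on both paths.
static_assert(sizeof(bfloat16_like) == 2, "one uint16_t payload");
static_assert(alignof(bfloat16_like) == 2, "2-byte alignment either way");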
paddle/phi/common/complex.h (4 additions, 0 deletions)

@@ -31,11 +31,15 @@
 #include <thrust/complex.h> // NOLINT
 #endif

+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // todo
paddle/phi/common/float16.h (4 additions, 0 deletions)

@@ -51,11 +51,15 @@
 #include <hip/hip_fp16.h>
 #endif

+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

 #define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)

paddle/phi/kernels/funcs/concat_and_split_functor.cu (4 additions, 52 deletions)

@@ -49,11 +49,15 @@ static inline void GetBlockDims(const phi::GPUContext& context,
   *grid_dims = dim3(grid_cols, grid_rows, 1);
 }

+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

 template <typename T, int Size>
 struct PointerWrapper {
@@ -572,15 +576,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
   std::vector<IndexT> inputs_col_vec(inputs_col_num, 0);
   const T** inputs_data = inputs_data_vec.data();
   IndexT* inputs_col = inputs_col_vec.data();
-#ifdef PADDLE_WITH_HIP
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  phi::Allocator::AllocationPtr data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
-  inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-  phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
-      phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
-  inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
-#endif

   bool has_same_shape = true;
   for (int i = 0; i < in_num; ++i) {
@@ -604,19 +599,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
                       in_num,
                       limit_num,
                       has_same_shape);
-
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* col_alloc_released = col_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
-    VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
-    VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(col_alloc_released);
-  });
-#endif
 }

 template <typename T>
@@ -780,25 +762,6 @@ void SplitFunctorDispatchWithIndexType(
   IndexT* outs_cols = outputs_cols_vec.data();
   T** outs_data = nullptr;

-  // There are some differences between hip runtime and NV runtime.
-  // In NV, when the pageable memory data less than 64K is transferred from
-  // hosttodevice, it will be automatically asynchronous.
-  // However, only pinned memory in hip can copy asynchronously
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-  // 3.2.6.1. Concurrent Execution between Host and Device
-  // Memory copies from host to device of a memory block of 64 KB or less
-#ifdef PADDLE_WITH_HIP
-  phi::Allocator::AllocationPtr data_alloc, cols_alloc;
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
-  outs_data = reinterpret_cast<T**>(data_alloc->ptr());
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
-                                        (out_cols_num) * sizeof(IndexT));
-  outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
-#endif
-
   outs_cols[0] = 0;
   for (int i = 0; i < out_num; ++i) {
     IndexT t_col = ref_ins.at(i)->numel() / out_row;
@@ -835,17 +798,6 @@ void SplitFunctorDispatchWithIndexType(
                        outs_data));
     }
   }
-
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* cols_alloc_released = cols_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(cols_alloc_released);
-  });
-#endif
 }

 template <typename T>
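For reference, the deleted branches implemented the pattern described in the removed comment: because HIP only performs host-to-device copies asynchronously when the source is pinned memory, the kernel-argument arrays were staged in pinned buffers and freed from a stream callback. A rough raw-HIP sketch of that pattern (hypothetical helper names, not Paddle code):

#include <hip/hip_runtime.h>
#include <cstring>

// Runs on a driver thread once prior work in the stream has finished.
// Note: the deleted Paddle code freed its pinned allocations the same way;
// strictly, making runtime calls inside a stream callback is discouraged,
// so production code may prefer to defer the free to the host thread.
static void FreePinned(hipStream_t /*stream*/, hipError_t /*status*/, void* p) {
  hipHostFree(p);
}

// Hypothetical helper: stage an array of host pointers in pinned memory so
// the upload can overlap with other stream work, then reclaim the staging
// buffer after the stream has consumed it.
void StagePointersAsync(void* const* host_ptrs, size_t n,
                        void** dev_dst, hipStream_t stream) {
  void* pinned = nullptr;
  hipHostMalloc(&pinned, n * sizeof(void*), hipHostMallocDefault);
  std::memcpy(pinned, host_ptrs, n * sizeof(void*));  // fill on the host
  hipMemcpyAsync(dev_dst, pinned, n * sizeof(void*),
                 hipMemcpyHostToDevice, stream);  // async only because pinned
  hipStreamAddCallback(stream, FreePinned, pinned, 0);
}

With the branches gone, the ROCm build follows the same code path as CUDA, staging directly from the ordinary host-side std::vector buffers.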
paddle/phi/kernels/funcs/segmented_array.h (4 additions, 0 deletions)

@@ -21,11 +21,15 @@
 namespace phi {
 namespace funcs {

+#ifndef PADDLE_WITH_HIP
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

 enum class SegmentedArraySize {
   kVariableLength = 0,
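PADDLE_ALIGN shows up here because the wrappers in this header (like the PointerWrapper visible in the .cu diff above) are passed to kernels by value, so host and device must agree on the struct layout; with the attribute disabled under HIP, both sides simply use the compiler's natural layout. A simplified sketch of that by-value pattern (hypothetical names, not the phi implementation):

#include <hip/hip_runtime.h>

// Simplified stand-in for phi's PointerWrapper: a fixed-size array of input
// pointers embedded directly in the kernel-argument struct.
template <typename T, int Size>
struct PointerArray {
  const T* data[Size];
};

// The wrapper is passed by value, so its bytes travel with the kernel launch
// itself and no separate device-side copy of the pointer table is needed.
template <typename T, int Size>
__global__ void GatherFirstElems(PointerArray<T, Size> ins, T* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = ins.data[i][0];  // first element of the i-th input
  }
}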