[cherry-pick] Fix compilation errors on dtk23.04 (#56533)
* [cherry-pick] Fix compilation errors on dtk23.04

* update

* update
ronny1996 authored Aug 24, 2023
1 parent 63e3e23 commit 09ed607
Showing 13 changed files with 149 additions and 68 deletions.
16 changes: 13 additions & 3 deletions cmake/cupti.cmake
@@ -2,9 +2,15 @@ if(NOT WITH_GPU AND NOT WITH_ROCM)
return()
endif()

-set(CUPTI_ROOT
-    "/usr"
-    CACHE PATH "CUPTI ROOT")
+if(WITH_ROCM)
+  set(CUPTI_ROOT
+      "${ROCM_PATH}/cuda/extras/CUPTI"
+      CACHE PATH "CUPTI ROOT")
+else()
+  set(CUPTI_ROOT
+      "/usr"
+      CACHE PATH "CUPTI ROOT")
+endif()
find_path(
CUPTI_INCLUDE_DIR cupti.h
PATHS ${CUPTI_ROOT}
@@ -46,6 +52,10 @@ find_library(
get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
set(CUPTI_FOUND ON)
+  if(WITH_ROCM)
+    include_directories(${ROCM_PATH}/cuda/include)
+    add_definitions(-D__CUDA_HIP_PLATFORM_AMD__)
+  endif()
else()
set(CUPTI_FOUND OFF)
endif()
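
Background on this file's change: on ROCm, CUPTI-compatible headers live under ${ROCM_PATH}/cuda/extras/CUPTI rather than /usr, and the added __CUDA_HIP_PLATFORM_AMD__ define is presumably what lets the CUDA-flavored headers under ${ROCM_PATH}/cuda/include resolve to HIP types. A minimal sketch of code keying off such a platform define (the gpuStream_t alias is illustrative, not Paddle's):

// Illustrative only: consumers of the CUDA-compat headers can branch on the
// platform define added by this commit.
#ifdef __CUDA_HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;  // CUDA-style code maps onto HIP types
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif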
17 changes: 13 additions & 4 deletions cmake/hip.cmake
@@ -107,6 +107,15 @@ list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion)
list(APPEND HIP_CXX_FLAGS -Wno-pass-failed)
list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
list(APPEND HIP_CXX_FLAGS -std=c++14)
+list(APPEND HIP_CXX_FLAGS -Wno-unused-result)
+list(APPEND HIP_CXX_FLAGS -Wno-deprecated-declarations)
+list(APPEND HIP_CXX_FLAGS -Wno-format)
+list(APPEND HIP_CXX_FLAGS -Wno-dangling-gsl)
+list(APPEND HIP_CXX_FLAGS -Wno-unused-value)
+list(APPEND HIP_CXX_FLAGS -Wno-braced-scalar-init)
+list(APPEND HIP_CXX_FLAGS -Wno-return-type)
+list(APPEND HIP_CXX_FLAGS -Wno-pragma-once-outside-header)
+list(APPEND HIP_CXX_FLAGS --gpu-max-threads-per-block=1024)

if(CMAKE_BUILD_TYPE MATCHES Debug)
list(APPEND HIP_CXX_FLAGS -g2)
@@ -119,11 +128,11 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
# Ask hcc to generate device code during compilation so we can use
# host linker to link.
list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx908)
list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx908)

if(HIP_COMPILER STREQUAL clang)
set(hip_library_name amdhip64)
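
Two notes on the flag changes (general ROCm behavior, not established by this diff): newer ROCm clang deprecates --amdgpu-target in favor of --offload-arch, and --gpu-max-threads-per-block=1024 sets the default launch bounds for kernels that do not declare their own. Roughly, an unannotated kernel is compiled as if it carried the attribute explicitly:

// Sketch (assumption about the flag's effect): equivalent explicit form of a
// kernel built with --gpu-max-threads-per-block=1024.
__global__ void __launch_bounds__(1024) fill(float* p, float v, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p[i] = v;
}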
7 changes: 7 additions & 0 deletions paddle/phi/backends/dynload/rccl.cc
@@ -28,9 +28,16 @@ RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif

+#if NCCL_VERSION_CODE >= 2304
+RCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP)
+#endif

#if NCCL_VERSION_CODE >= 2703
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif

+#if NCCL_VERSION_CODE >= 21100
+RCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP)
+#endif
} // namespace dynload
} // namespace phi
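
The numeric gates follow NCCL's version encoding, which RCCL mirrors (an assumption based on NCCL's headers): major*1000 + minor*100 + patch before 2.9, and major*10000 + minor*100 + patch from 2.9 on, so the three blocks correspond to 2.3.4, 2.7.3, and 2.11.0:

// Version-code arithmetic for the gates above (encoding scheme assumed).
static_assert(2304 == 2 * 1000 + 3 * 100 + 4, "gate for NCCL/RCCL 2.3.4");
static_assert(2703 == 2 * 1000 + 7 * 100 + 3, "gate for NCCL/RCCL 2.7.3");
static_assert(21100 == 2 * 10000 + 11 * 100 + 0, "gate for NCCL/RCCL 2.11.0");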
11 changes: 11 additions & 0 deletions paddle/phi/backends/dynload/rccl.h
@@ -64,12 +64,23 @@ RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif

+#if NCCL_VERSION_CODE >= 2304
+#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
+RCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+#endif

#if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif

+#if NCCL_VERSION_CODE >= 21100
+#define RCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
+  __macro(ncclRedOpCreatePreMulSum);                \
+  __macro(ncclRedOpDestroy);
+RCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+#endif
} // namespace dynload
} // namespace phi
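
For readers unfamiliar with the macro pattern: DECLARE_DYNAMIC_LOAD_RCCL_WRAP expands each __macro(name) into a wrapper that resolves the symbol from the RCCL shared library on first use, so adding ncclGetVersion and the RedOp routines here makes them callable through phi::dynload. A condensed sketch of the idea, simplified from Paddle's actual dynload machinery (names and soname here are illustrative):

#include <dlfcn.h>

// Each wrapped routine lazily dlsym-resolves its symbol once, then calls
// through the cached function pointer.
template <typename FnPtr>
FnPtr LoadRcclSymbol(const char* name) {
  static void* handle = dlopen("librccl.so", RTLD_LAZY | RTLD_LOCAL);
  return handle ? reinterpret_cast<FnPtr>(dlsym(handle, name)) : nullptr;
}

// Usage sketch: the #if above guarantees the prototype exists at compile
// time; the lookup guards against an older library at run time.
// auto send = LoadRcclSymbol<decltype(&ncclSend)>("ncclSend");
// if (send) send(buf, count, ncclFloat16, peer, comm, stream);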
4 changes: 4 additions & 0 deletions paddle/phi/common/bfloat16.h
@@ -31,11 +31,15 @@
#include <cuda_bf16.h>
#endif

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

namespace phi {
namespace dtype {
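
The same PADDLE_ALIGN change recurs in complex.h, float16.h, concat_and_split_functor.cu, and segmented_array.h below: under HIP the macro now expands to nothing, so these types fall back to their natural alignment. A self-contained illustration of what the attribute otherwise does (stand-in macro, not the Paddle header):

#include <cstdint>

// Stand-in for the non-HIP branch of PADDLE_ALIGN.
#define ALIGN_DEMO(x) __attribute__((aligned(x)))

struct ALIGN_DEMO(4) Bf16Pair {
  uint16_t x, y;
};
static_assert(alignof(Bf16Pair) == 4, "attribute raises alignment to 4");
static_assert(sizeof(Bf16Pair) == 4, "payload size is unchanged here");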
4 changes: 4 additions & 0 deletions paddle/phi/common/complex.h
@@ -31,11 +31,15 @@
#include <thrust/complex.h> // NOLINT
#endif

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// todo
4 changes: 4 additions & 0 deletions paddle/phi/common/float16.h
@@ -51,11 +51,15 @@
#include <hip/hip_fp16.h>
#endif

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)

60 changes: 54 additions & 6 deletions paddle/phi/kernels/funcs/blas/blas_impl.hip.h
@@ -999,12 +999,10 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
-  rocblas_operation cuTransA = (transA == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  rocblas_operation cuTransB = (transB == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
+  rocblas_operation cuTransA =
+      transA ? rocblas_operation_none : rocblas_operation_transpose;
+  rocblas_operation cuTransB =
+      transB ? rocblas_operation_none : rocblas_operation_transpose;
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
80,
@@ -1175,6 +1173,56 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
});
}

+template <>
+template <>
+inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+                                               CBLAS_TRANSPOSE transB,
+                                               int M,
+                                               int N,
+                                               int K,
+                                               float16 alpha,
+                                               const float16 *A,
+                                               const float16 *B,
+                                               float16 beta,
+                                               float16 *C,
+                                               int batchCount,
+                                               int64_t strideA,
+                                               int64_t strideB) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  rocblas_operation cuTransA = (transA == CblasNoTrans)
+                                   ? rocblas_operation_none
+                                   : rocblas_operation_transpose;
+  rocblas_operation cuTransB = (transB == CblasNoTrans)
+                                   ? rocblas_operation_none
+                                   : rocblas_operation_transpose;
+  const int64_t strideC = M * N;
+  context_.CublasCall([&](rocblas_handle handle) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_hgemm_strided_batched(
+        handle,
+        cuTransB,
+        cuTransA,
+        N,
+        M,
+        K,
+        reinterpret_cast<const rocblas_half *>(&alpha),
+        reinterpret_cast<const rocblas_half *>(B),
+        ldb,
+        strideB,
+        reinterpret_cast<const rocblas_half *>(A),
+        lda,
+        strideA,
+        reinterpret_cast<const rocblas_half *>(&beta),
+        reinterpret_cast<rocblas_half *>(C),
+        ldc,
+        strideC,
+        batchCount));
+  });
+}

// note(wangran16): unknown bug. parameters dislocation when calling
// GEMM_STRIDED_BATCH<float> and GEMM_STRIDED_BATCH<double>
template <>
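
Two things to note about the new float16 specialization: rocBLAS, like cuBLAS, is column-major, so the row-major product C = A*B is computed as C^T = B^T*A^T, which is why B and A (and N and M) are passed in swapped order; and strideA/strideB/strideC step between consecutive matrices of a batch. A reference sketch of the strided-batched semantics for the non-transposed case, with float standing in for rocblas_half (illustration, not Paddle code):

#include <cstdint>

// Column-major reference: element (i, j) of an M x N matrix with leading
// dimension ld lives at [i + j * ld]; batch entries sit strideX apart.
void StridedBatchedGemmRef(int M, int N, int K, float alpha,
                           const float* A, int lda, int64_t strideA,
                           const float* B, int ldb, int64_t strideB,
                           float beta, float* C, int ldc, int64_t strideC,
                           int batchCount) {
  for (int b = 0; b < batchCount; ++b) {
    const float* Ab = A + b * strideA;
    const float* Bb = B + b * strideB;
    float* Cb = C + b * strideC;
    for (int j = 0; j < N; ++j)
      for (int i = 0; i < M; ++i) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) acc += Ab[i + k * lda] * Bb[k + j * ldb];
        Cb[i + j * ldc] = alpha * acc + beta * Cb[i + j * ldc];
      }
  }
}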
56 changes: 4 additions & 52 deletions paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -49,11 +49,15 @@ static inline void GetBlockDims(const phi::GPUContext& context,
*grid_dims = dim3(grid_cols, grid_rows, 1);
}

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x)
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

template <typename T, int Size>
struct PointerWrapper {
@@ -572,15 +576,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
std::vector<IndexT> inputs_col_vec(inputs_col_num, 0);
const T** inputs_data = inputs_data_vec.data();
IndexT* inputs_col = inputs_col_vec.data();
-#ifdef PADDLE_WITH_HIP
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  phi::Allocator::AllocationPtr data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
-  inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-  phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
-      phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
-  inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
-#endif

bool has_same_shape = true;
for (int i = 0; i < in_num; ++i) {
@@ -604,19 +599,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
in_num,
limit_num,
has_same_shape);

-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* col_alloc_released = col_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
-    VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
-    VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(col_alloc_released);
-  });
-#endif
}

template <typename T>
@@ -780,25 +762,6 @@ void SplitFunctorDispatchWithIndexType(
IndexT* outs_cols = outputs_cols_vec.data();
T** outs_data = nullptr;

-  // There are some differences between hip runtime and NV runtime.
-  // In NV, when the pageable memory data less than 64K is transferred from
-  // hosttodevice, it will be automatically asynchronous.
-  // However, only pinned memory in hip can copy asynchronously
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-  // 3.2.6.1. Concurrent Execution between Host and Device
-  // Memory copies from host to device of a memory block of 64 KB or less
-#ifdef PADDLE_WITH_HIP
-  phi::Allocator::AllocationPtr data_alloc, cols_alloc;
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
-  outs_data = reinterpret_cast<T**>(data_alloc->ptr());
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
-                                        (out_cols_num) * sizeof(IndexT));
-  outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
-#endif

outs_cols[0] = 0;
for (int i = 0; i < out_num; ++i) {
IndexT t_col = ref_ins.at(i)->numel() / out_row;
@@ -835,17 +798,6 @@ void SplitFunctorDispatchWithIndexType(
outs_data));
}
}

-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* cols_alloc_released = cols_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(cols_alloc_released);
-  });
-#endif
}

template <typename T>
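
The deleted HIP-only blocks were a pinned-memory workaround: the small host-side pointer/offset arrays were staged in pinned memory because (per the removed comment) HIP, unlike CUDA, only copies to the device asynchronously from pinned memory, and the buffers were freed from a stream callback so they outlived the kernel launch. Dropping the workaround on dtk23.04 presumably means the plain host-vector path is now safe there. For reference, a condensed form of the removed lifetime trick (uses the phi APIs named in the diff; bytes is a placeholder):

// Release the smart pointer so the pinned buffer survives scope exit, then
// reclaim it only after all work queued on the stream has run.
phi::Allocator::AllocationPtr alloc =
    phi::memory_utils::Alloc(phi::GPUPinnedPlace(), bytes);
auto* released = alloc.release();   // keep alive past this scope
ctx.AddStreamCallback([released] {  // fires after queued kernels finish
  phi::memory_utils::AllocationDeleter(released);
});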
4 changes: 4 additions & 0 deletions paddle/phi/kernels/funcs/segmented_array.h
@@ -21,11 +21,15 @@
namespace phi {
namespace funcs {

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x)
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

enum class SegmentedArraySize {
kVariableLength = 0,
9 changes: 9 additions & 0 deletions paddle/phi/kernels/funcs/top_k_function_cuda.h
@@ -54,6 +54,15 @@ struct radix_key_codec_base<phi::dtype::float16>
template <>
struct radix_key_codec_base<phi::dtype::bfloat16>
: radix_key_codec_integral<phi::dtype::bfloat16, uint16_t> {};

+#if HIP_VERSION >= 50400000
+template <>
+struct float_bit_mask<phi::dtype::float16> : float_bit_mask<rocprim::half> {};
+
+template <>
+struct float_bit_mask<phi::dtype::bfloat16>
+    : float_bit_mask<rocprim::bfloat16> {};
+#endif
} // namespace detail
} // namespace rocprim
namespace cub = hipcub;
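
Background on float_bit_mask (assumptions based on rocPRIM's radix-sort internals; the gate value 50400000 corresponds to HIP 5.4 under HIP's major*10000000 + minor*100000 + patch encoding): it supplies the sign/exponent/mantissa masks the sort uses to order floating-point keys, and the specializations above reuse the masks of the matching rocprim types. For a 16-bit IEEE half, the trait would carry:

#include <cstdint>

// Illustrative trait shape (member names assumed from rocPRIM): fp16 has
// 1 sign bit, 5 exponent bits, 10 mantissa bits.
struct Fp16BitMaskDemo {
  static constexpr uint16_t sign_bit = 0x8000;
  static constexpr uint16_t exponent = 0x7C00;
  static constexpr uint16_t mantissa = 0x03FF;
};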
19 changes: 19 additions & 0 deletions paddle/phi/kernels/gpu/argsort_kernel.cu
@@ -40,6 +40,19 @@ namespace detail {
template <>
struct radix_key_codec_base<phi::dtype::float16>
: radix_key_codec_integral<phi::dtype::float16, uint16_t> {};

+template <>
+struct radix_key_codec_base<phi::dtype::bfloat16>
+    : radix_key_codec_integral<phi::dtype::bfloat16, uint16_t> {};
+
+#if HIP_VERSION >= 50400000
+template <>
+struct float_bit_mask<phi::dtype::float16> : float_bit_mask<rocprim::half> {};
+
+template <>
+struct float_bit_mask<phi::dtype::bfloat16>
+    : float_bit_mask<rocprim::bfloat16> {};
+#endif
} // namespace detail
} // namespace rocprim
#else
@@ -48,7 +61,13 @@ namespace cub {
template <>
struct NumericTraits<phi::dtype::float16>
: BaseTraits<FLOATING_POINT, true, false, uint16_t, phi::dtype::float16> {};

+template <>
+struct NumericTraits<phi::dtype::bfloat16>
+    : BaseTraits<FLOATING_POINT, true, false, uint16_t, phi::dtype::bfloat16> {
+};
} // namespace cub

#endif

namespace phi {
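
On the CUDA path, cub's radix sort consults NumericTraits to bit-twiddle keys into an order-preserving unsigned form, so the added bfloat16 specialization tells it to treat Paddle's type as a 16-bit floating-point key. The underlying twiddle is general radix-sort background, not code from this PR:

#include <cstdint>

// Monotonic mapping from IEEE float bits to unsigned ints: flip all bits of
// negatives, flip only the sign bit of non-negatives; sorting the results as
// plain uint16_t then matches the numeric order of the original keys.
inline uint16_t TwiddleIn(uint16_t bits) {
  return (bits & 0x8000) ? static_cast<uint16_t>(~bits)
                         : static_cast<uint16_t>(bits | 0x8000);
}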