[cherry-pick] Fix compilation errors on dtk23.04 (#56533)
* [cherry-pick] Fix compilation errors on dtk23.04

* update

* update
ronny1996 authored Aug 24, 2023
1 parent 63e3e23 commit 09ed607
Showing 13 changed files with 149 additions and 68 deletions.
16 changes: 13 additions & 3 deletions cmake/cupti.cmake
@@ -2,9 +2,15 @@ if(NOT WITH_GPU AND NOT WITH_ROCM)
return()
endif()

-set(CUPTI_ROOT
-    "/usr"
-    CACHE PATH "CUPTI ROOT")
+if(WITH_ROCM)
+  set(CUPTI_ROOT
+      "${ROCM_PATH}/cuda/extras/CUPTI"
+      CACHE PATH "CUPTI ROOT")
+else()
+  set(CUPTI_ROOT
+      "/usr"
+      CACHE PATH "CUPTI ROOT")
+endif()
find_path(
CUPTI_INCLUDE_DIR cupti.h
PATHS ${CUPTI_ROOT}
@@ -46,6 +52,10 @@ find_library(
get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
set(CUPTI_FOUND ON)
+  if(WITH_ROCM)
+    include_directories(${ROCM_PATH}/cuda/include)
+    add_definitions(-D__CUDA_HIP_PLATFORM_AMD__)
+  endif()
else()
set(CUPTI_FOUND OFF)
endif()
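
Background on this file's change: on ROCm, CUPTI-compatible headers live under ${ROCM_PATH}/cuda/extras/CUPTI rather than /usr, and the added __CUDA_HIP_PLATFORM_AMD__ define is presumably what lets the CUDA-flavored headers under ${ROCM_PATH}/cuda/include resolve to HIP types. A minimal sketch of code keying off such a platform define (the gpuStream_t alias is illustrative, not Paddle's):

// Illustrative only: consumers of the CUDA-compat headers can branch on the
// platform define added by this commit.
#ifdef __CUDA_HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;  // CUDA-style code maps onto HIP types
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif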
17 changes: 13 additions & 4 deletions cmake/hip.cmake
@@ -107,6 +107,15 @@ list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion)
list(APPEND HIP_CXX_FLAGS -Wno-pass-failed)
list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
list(APPEND HIP_CXX_FLAGS -std=c++14)
+list(APPEND HIP_CXX_FLAGS -Wno-unused-result)
+list(APPEND HIP_CXX_FLAGS -Wno-deprecated-declarations)
+list(APPEND HIP_CXX_FLAGS -Wno-format)
+list(APPEND HIP_CXX_FLAGS -Wno-dangling-gsl)
+list(APPEND HIP_CXX_FLAGS -Wno-unused-value)
+list(APPEND HIP_CXX_FLAGS -Wno-braced-scalar-init)
+list(APPEND HIP_CXX_FLAGS -Wno-return-type)
+list(APPEND HIP_CXX_FLAGS -Wno-pragma-once-outside-header)
+list(APPEND HIP_CXX_FLAGS --gpu-max-threads-per-block=1024)

if(CMAKE_BUILD_TYPE MATCHES Debug)
list(APPEND HIP_CXX_FLAGS -g2)
@@ -119,11 +128,11 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
# Ask hcc to generate device code during compilation so we can use
# host linker to link.
list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx908)
list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx906)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx908)

if(HIP_COMPILER STREQUAL clang)
set(hip_library_name amdhip64)
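
Two notes on the flag changes (general ROCm behavior, not established by this diff): newer ROCm clang deprecates --amdgpu-target in favor of --offload-arch, and --gpu-max-threads-per-block=1024 sets the default launch bounds for kernels that do not declare their own. Roughly, an unannotated kernel is compiled as if it carried the attribute explicitly:

// Sketch (assumption about the flag's effect): equivalent explicit form of a
// kernel built with --gpu-max-threads-per-block=1024.
__global__ void __launch_bounds__(1024) fill(float* p, float v, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p[i] = v;
}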
7 changes: 7 additions & 0 deletions paddle/phi/backends/dynload/rccl.cc
@@ -28,9 +28,16 @@ RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif

+#if NCCL_VERSION_CODE >= 2304
+RCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP)
+#endif

#if NCCL_VERSION_CODE >= 2703
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif

+#if NCCL_VERSION_CODE >= 21100
+RCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP)
+#endif
} // namespace dynload
} // namespace phi
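
The numeric gates follow NCCL's version encoding, which RCCL mirrors (an assumption based on NCCL's headers): major*1000 + minor*100 + patch before 2.9, and major*10000 + minor*100 + patch from 2.9 on, so the three blocks correspond to 2.3.4, 2.7.3, and 2.11.0:

// Version-code arithmetic for the gates above (encoding scheme assumed).
static_assert(2304 == 2 * 1000 + 3 * 100 + 4, "gate for NCCL/RCCL 2.3.4");
static_assert(2703 == 2 * 1000 + 7 * 100 + 3, "gate for NCCL/RCCL 2.7.3");
static_assert(21100 == 2 * 10000 + 11 * 100 + 0, "gate for NCCL/RCCL 2.11.0");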
11 changes: 11 additions & 0 deletions paddle/phi/backends/dynload/rccl.h
@@ -64,12 +64,23 @@ RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif

+#if NCCL_VERSION_CODE >= 2304
+#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion);
+RCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+#endif

#if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif

+#if NCCL_VERSION_CODE >= 21100
+#define RCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \
+  __macro(ncclRedOpCreatePreMulSum);                \
+  __macro(ncclRedOpDestroy);
+RCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
+#endif
} // namespace dynload
} // namespace phi
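
For readers unfamiliar with the macro pattern: DECLARE_DYNAMIC_LOAD_RCCL_WRAP expands each __macro(name) into a wrapper that resolves the symbol from the RCCL shared library on first use, so adding ncclGetVersion and the RedOp routines here makes them callable through phi::dynload. A condensed sketch of the idea, simplified from Paddle's actual dynload machinery (names and soname here are illustrative):

#include <dlfcn.h>

// Each wrapped routine lazily dlsym-resolves its symbol once, then calls
// through the cached function pointer.
template <typename FnPtr>
FnPtr LoadRcclSymbol(const char* name) {
  static void* handle = dlopen("librccl.so", RTLD_LAZY | RTLD_LOCAL);
  return handle ? reinterpret_cast<FnPtr>(dlsym(handle, name)) : nullptr;
}

// Usage sketch: the #if above guarantees the prototype exists at compile
// time; the lookup guards against an older library at run time.
// auto send = LoadRcclSymbol<decltype(&ncclSend)>("ncclSend");
// if (send) send(buf, count, ncclFloat16, peer, comm, stream);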
4 changes: 4 additions & 0 deletions paddle/phi/common/bfloat16.h
@@ -31,11 +31,15 @@
#include <cuda_bf16.h>
#endif

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

namespace phi {
namespace dtype {
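
The same PADDLE_ALIGN change recurs in complex.h, float16.h, concat_and_split_functor.cu, and segmented_array.h below: under HIP the macro now expands to nothing, so these types fall back to their natural alignment. A self-contained illustration of what the attribute otherwise does (stand-in macro, not the Paddle header):

#include <cstdint>

// Stand-in for the non-HIP branch of PADDLE_ALIGN.
#define ALIGN_DEMO(x) __attribute__((aligned(x)))

struct ALIGN_DEMO(4) Bf16Pair {
  uint16_t x, y;
};
static_assert(alignof(Bf16Pair) == 4, "attribute raises alignment to 4");
static_assert(sizeof(Bf16Pair) == 4, "payload size is unchanged here");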
4 changes: 4 additions & 0 deletions paddle/phi/common/complex.h
@@ -31,11 +31,15 @@
#include <thrust/complex.h> // NOLINT
#endif

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// todo
4 changes: 4 additions & 0 deletions paddle/phi/common/float16.h
@@ -51,11 +51,15 @@
#include <hip/hip_fp16.h>
#endif

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x) __declspec(align(x))
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)

60 changes: 54 additions & 6 deletions paddle/phi/kernels/funcs/blas/blas_impl.hip.h
@@ -999,12 +999,10 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
-  rocblas_operation cuTransA = (transA == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
-  rocblas_operation cuTransB = (transB == CblasNoTrans)
-                                   ? rocblas_operation_none
-                                   : rocblas_operation_transpose;
+  rocblas_operation cuTransA =
+      transA ? rocblas_operation_none : rocblas_operation_transpose;
+  rocblas_operation cuTransB =
+      transB ? rocblas_operation_none : rocblas_operation_transpose;
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
80,
@@ -1175,6 +1173,56 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
});
}

+template <>
+template <>
+inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+                                               CBLAS_TRANSPOSE transB,
+                                               int M,
+                                               int N,
+                                               int K,
+                                               float16 alpha,
+                                               const float16 *A,
+                                               const float16 *B,
+                                               float16 beta,
+                                               float16 *C,
+                                               int batchCount,
+                                               int64_t strideA,
+                                               int64_t strideB) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  rocblas_operation cuTransA = (transA == CblasNoTrans)
+                                   ? rocblas_operation_none
+                                   : rocblas_operation_transpose;
+  rocblas_operation cuTransB = (transB == CblasNoTrans)
+                                   ? rocblas_operation_none
+                                   : rocblas_operation_transpose;
+  const int64_t strideC = M * N;
+  context_.CublasCall([&](rocblas_handle handle) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_hgemm_strided_batched(
+        handle,
+        cuTransB,
+        cuTransA,
+        N,
+        M,
+        K,
+        reinterpret_cast<const rocblas_half *>(&alpha),
+        reinterpret_cast<const rocblas_half *>(B),
+        ldb,
+        strideB,
+        reinterpret_cast<const rocblas_half *>(A),
+        lda,
+        strideA,
+        reinterpret_cast<const rocblas_half *>(&beta),
+        reinterpret_cast<rocblas_half *>(C),
+        ldc,
+        strideC,
+        batchCount));
+  });
+}

// note(wangran16): unknown bug. parameters dislocation when calling
// GEMM_STRIDED_BATCH<float> and GEMM_STRIDED_BATCH<double>
template <>
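
Two things to note about the new float16 specialization: rocBLAS, like cuBLAS, is column-major, so the row-major product C = A*B is computed as C^T = B^T*A^T, which is why B and A (and N and M) are passed in swapped order; and strideA/strideB/strideC step between consecutive matrices of a batch. A reference sketch of the strided-batched semantics for the non-transposed case, with float standing in for rocblas_half (illustration, not Paddle code):

#include <cstdint>

// Column-major reference: element (i, j) of an M x N matrix with leading
// dimension ld lives at [i + j * ld]; batch entries sit strideX apart.
void StridedBatchedGemmRef(int M, int N, int K, float alpha,
                           const float* A, int lda, int64_t strideA,
                           const float* B, int ldb, int64_t strideB,
                           float beta, float* C, int ldc, int64_t strideC,
                           int batchCount) {
  for (int b = 0; b < batchCount; ++b) {
    const float* Ab = A + b * strideA;
    const float* Bb = B + b * strideB;
    float* Cb = C + b * strideC;
    for (int j = 0; j < N; ++j)
      for (int i = 0; i < M; ++i) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) acc += Ab[i + k * lda] * Bb[k + j * ldb];
        Cb[i + j * ldc] = alpha * acc + beta * Cb[i + j * ldc];
      }
  }
}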
56 changes: 4 additions & 52 deletions paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -49,11 +49,15 @@ static inline void GetBlockDims(const phi::GPUContext& context,
*grid_dims = dim3(grid_cols, grid_rows, 1);
}

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x)
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

template <typename T, int Size>
struct PointerWrapper {
@@ -572,15 +576,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
std::vector<IndexT> inputs_col_vec(inputs_col_num, 0);
const T** inputs_data = inputs_data_vec.data();
IndexT* inputs_col = inputs_col_vec.data();
-#ifdef PADDLE_WITH_HIP
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  phi::Allocator::AllocationPtr data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
-  inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-  phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
-      phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
-  inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
-#endif

bool has_same_shape = true;
for (int i = 0; i < in_num; ++i) {
@@ -604,19 +599,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
in_num,
limit_num,
has_same_shape);

-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* col_alloc_released = col_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
-    VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
-    VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(col_alloc_released);
-  });
-#endif
}

template <typename T>
@@ -780,25 +762,6 @@ void SplitFunctorDispatchWithIndexType(
IndexT* outs_cols = outputs_cols_vec.data();
T** outs_data = nullptr;

-  // There are some differences between hip runtime and NV runtime.
-  // In NV, when the pageable memory data less than 64K is transferred from
-  // hosttodevice, it will be automatically asynchronous.
-  // However, only pinned memory in hip can copy asynchronously
-  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-  // 3.2.6.1. Concurrent Execution between Host and Device
-  // Memory copies from host to device of a memory block of 64 KB or less
-#ifdef PADDLE_WITH_HIP
-  phi::Allocator::AllocationPtr data_alloc, cols_alloc;
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
-  outs_data = reinterpret_cast<T**>(data_alloc->ptr());
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
-                                        (out_cols_num) * sizeof(IndexT));
-  outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
-#endif

outs_cols[0] = 0;
for (int i = 0; i < out_num; ++i) {
IndexT t_col = ref_ins.at(i)->numel() / out_row;
@@ -835,17 +798,6 @@ void SplitFunctorDispatchWithIndexType(
outs_data));
}
}

-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* cols_alloc_released = cols_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(cols_alloc_released);
-  });
-#endif
}

template <typename T>
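
The deleted HIP-only blocks were a pinned-memory workaround: the small host-side pointer/offset arrays were staged in pinned memory because (per the removed comment) HIP, unlike CUDA, only copies to the device asynchronously from pinned memory, and the buffers were freed from a stream callback so they outlived the kernel launch. Dropping the workaround on dtk23.04 presumably means the plain host-vector path is now safe there. For reference, a condensed form of the removed lifetime trick (uses the phi APIs named in the diff; bytes is a placeholder):

// Release the smart pointer so the pinned buffer survives scope exit, then
// reclaim it only after all work queued on the stream has run.
phi::Allocator::AllocationPtr alloc =
    phi::memory_utils::Alloc(phi::GPUPinnedPlace(), bytes);
auto* released = alloc.release();   // keep alive past this scope
ctx.AddStreamCallback([released] {  // fires after queued kernels finish
  phi::memory_utils::AllocationDeleter(released);
});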
4 changes: 4 additions & 0 deletions paddle/phi/kernels/funcs/segmented_array.h
@@ -21,11 +21,15 @@
namespace phi {
namespace funcs {

+#ifndef PADDLE_WITH_HIP
#if !defined(_WIN32)
#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
#else
#define PADDLE_ALIGN(x)
#endif
+#else
+#define PADDLE_ALIGN(x)
+#endif

enum class SegmentedArraySize {
kVariableLength = 0,
9 changes: 9 additions & 0 deletions paddle/phi/kernels/funcs/top_k_function_cuda.h
@@ -54,6 +54,15 @@ struct radix_key_codec_base<phi::dtype::float16>
template <>
struct radix_key_codec_base<phi::dtype::bfloat16>
: radix_key_codec_integral<phi::dtype::bfloat16, uint16_t> {};

+#if HIP_VERSION >= 50400000
+template <>
+struct float_bit_mask<phi::dtype::float16> : float_bit_mask<rocprim::half> {};
+
+template <>
+struct float_bit_mask<phi::dtype::bfloat16>
+    : float_bit_mask<rocprim::bfloat16> {};
+#endif
} // namespace detail
} // namespace rocprim
namespace cub = hipcub;
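
Background on float_bit_mask (assumptions based on rocPRIM's radix-sort internals; the gate value 50400000 corresponds to HIP 5.4 under HIP's major*10000000 + minor*100000 + patch encoding): it supplies the sign/exponent/mantissa masks the sort uses to order floating-point keys, and the specializations above reuse the masks of the matching rocprim types. For a 16-bit IEEE half, the trait would carry:

#include <cstdint>

// Illustrative trait shape (member names assumed from rocPRIM): fp16 has
// 1 sign bit, 5 exponent bits, 10 mantissa bits.
struct Fp16BitMaskDemo {
  static constexpr uint16_t sign_bit = 0x8000;
  static constexpr uint16_t exponent = 0x7C00;
  static constexpr uint16_t mantissa = 0x03FF;
};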
19 changes: 19 additions & 0 deletions paddle/phi/kernels/gpu/argsort_kernel.cu
@@ -40,6 +40,19 @@ namespace detail {
template <>
struct radix_key_codec_base<phi::dtype::float16>
: radix_key_codec_integral<phi::dtype::float16, uint16_t> {};

+template <>
+struct radix_key_codec_base<phi::dtype::bfloat16>
+    : radix_key_codec_integral<phi::dtype::bfloat16, uint16_t> {};
+
+#if HIP_VERSION >= 50400000
+template <>
+struct float_bit_mask<phi::dtype::float16> : float_bit_mask<rocprim::half> {};
+
+template <>
+struct float_bit_mask<phi::dtype::bfloat16>
+    : float_bit_mask<rocprim::bfloat16> {};
+#endif
} // namespace detail
} // namespace rocprim
#else
@@ -48,7 +61,13 @@ namespace cub {
template <>
struct NumericTraits<phi::dtype::float16>
: BaseTraits<FLOATING_POINT, true, false, uint16_t, phi::dtype::float16> {};

+template <>
+struct NumericTraits<phi::dtype::bfloat16>
+    : BaseTraits<FLOATING_POINT, true, false, uint16_t, phi::dtype::bfloat16> {
+};
} // namespace cub

#endif

namespace phi {
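
On the CUDA path, cub's radix sort consults NumericTraits to bit-twiddle keys into an order-preserving unsigned form, so the added bfloat16 specialization tells it to treat Paddle's type as a 16-bit floating-point key. The underlying twiddle is general radix-sort background, not code from this PR:

#include <cstdint>

// Monotonic mapping from IEEE float bits to unsigned ints: flip all bits of
// negatives, flip only the sign bit of non-negatives; sorting the results as
// plain uint16_t then matches the numeric order of the original keys.
inline uint16_t TwiddleIn(uint16_t bits) {
  return (bits & 0x8000) ? static_cast<uint16_t>(~bits)
                         : static_cast<uint16_t>(bits | 0x8000);
}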