fix HIP (PaddlePaddle#204)

DesmonDay · Feb 2, 2023 · 1f69e92 · 1f69e92
1 parent f91ae8d
commit 1f69e92
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 3 deletions.
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
@@ -47,8 +47,11 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
   }
   const size_t& memset_bytes = memset_size * sizeof(T);
 
+#ifdef PADDLE_WITH_HIP
+  hipMemset(p_output, 0, memset_bytes);
+#else
   cudaMemsetAsync(p_output, 0, memset_bytes, ctx.stream());
-
+#endif
   if (index_size == 0) return;
 
   int64_t slice_size = 1;

diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
@@ -124,9 +124,11 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
     dst_count->Resize({input_size});
     ctx.template Alloc<int32_t>(dst_count);
     int* p_dst_count = dst_count->data<int>();
-
+#ifdef PADDLE_WITH_HIP
+    hipMemset(p_dst_count, 0, input_size * sizeof(int));
+#else
     cudaMemsetAsync(p_dst_count, 0, input_size * sizeof(int), ctx.stream());
-
+#endif
     int64_t grid_count = (index_size + block - 1) / block;
     ComputeCountCUDAKernel<T, IndexT><<<grid_count, block, 0, ctx.stream()>>>(
         p_dst_count, d_index, index_size);

diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu
@@ -196,8 +196,12 @@ static void UniqueFlattendCUDATensor(const Context& context,
   indices->Resize(phi::make_ddim({num_input}));
   auto* indices_data = context.template Alloc<IndexT>(indices);
 
+#ifdef PADDLE_WITH_CUDA
   paddle::memory::ThrustAllocator<cudaStream_t> allocator(context.GetPlace(), context.stream());
   const auto &exec_policy = thrust::cuda::par(allocator).on(context.stream());
+#else
+  const auto &exec_policy = thrust::hip::par.on(context.stream());
+#endif
 
   thrust::sequence(exec_policy, indices_data, indices_data + num_input);
   thrust::sort_by_key(
@@ -232,7 +236,11 @@ static void UniqueFlattendCUDATensor(const Context& context,
                                 in_data_hat + num_input,
                                 inv_loc_data_ptr,
                                 not_equal);
+#ifdef PADDLE_WITH_HIP
+    hipMemset(inv_loc_data_ptr, 0, sizeof(IndexT));
+#else
     cudaMemsetAsync(inv_loc_data_ptr, 0, sizeof(IndexT), context.stream());
+#endif
     size_t temp_storage_bytes = 0;
     cub::DeviceScan::InclusiveSum(NULL,
                                   temp_storage_bytes,
@@ -305,8 +313,12 @@ static void ComputeUniqueDims(const Context& context,
                               equal_T equal,
                               not_equal_T not_equal,
                               int64_t row) {
+#ifdef PADDLE_WITH_CUDA
   paddle::memory::ThrustAllocator<cudaStream_t> allocator(context.GetPlace(), context.stream());
   const auto &exec_policy = thrust::cuda::par(allocator).on(context.stream());
+#else
+  const auto &exec_policy = thrust::hip::par.on(context.stream());
+#endif
   // 1. inverse indices: 'inverse'
   inverse->Resize(phi::make_ddim({row}));
   auto* inverse_data = context.template Alloc<IndexT>(inverse);
@@ -401,8 +413,12 @@ static void UniqueDimsCUDATensor(const Context& context,
 
   // 2. Calculate 'indices', 'inverse', 'counts'
   // Init index and sort  
+#ifdef PADDLE_WITH_CUDA
   paddle::memory::ThrustAllocator<cudaStream_t> allocator(context.GetPlace(), context.stream());
   const auto &exec_policy = thrust::cuda::par(allocator).on(context.stream());
+#else
+  const auto &exec_policy = thrust::hip::par.on(context.stream());
+#endif
   thrust::sequence(
       exec_policy, sorted_indices_data, sorted_indices_data + row);
   thrust::sort(exec_policy,