From 5d028c4df824685b091951e79aa2703aa035c247 Mon Sep 17 00:00:00 2001 From: Denghui Lu <denghuilu@pku.edu.cn> Date: Thu, 26 Aug 2021 07:43:12 +0800 Subject: [PATCH] speedup scan_nlist kernel (#1028) * speedup cuda kernel scan_nlist * fix no-pbc error --- source/lib/src/cuda/neighbor_list.cu | 97 +++++++++++++++-------- source/lib/src/rocm/neighbor_list.hip.cu | 98 ++++++++++++++++-------- 2 files changed, 133 insertions(+), 62 deletions(-) diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/cuda/neighbor_list.cu index 66bd122079..430c869f4e 100644 --- a/source/lib/src/cuda/neighbor_list.cu +++ b/source/lib/src/cuda/neighbor_list.cu @@ -2,6 +2,69 @@ #include "gpu_cuda.h" #include "neighbor_list.h" +#include <cub/cub.cuh> +// A stateful callback functor that maintains a running prefix to be applied +// during consecutive scan operations. +struct parallel_prefix_scan_op +{ + // Running prefix + int running_total; + // Constructor + __device__ parallel_prefix_scan_op(int running_total) : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
+ __device__ int operator()(int block_aggregate) + { + int old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +template < + int THREADS_PER_BLOCK> +__global__ void parallel_prefix_scan( + int * numneigh, + int * nei_order, + const int * temp_nlist, + const int mem_size, + const int nloc, + const int nall +) +{ + // Specialize BlockScan for a 1D block of THREADS_PER_BLOCK threads scanning int items + typedef cub::BlockScan<int, THREADS_PER_BLOCK> BlockScan; + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Initialize running total + parallel_prefix_scan_op prefix_op(0); + + // Have the block iterate over segments of items + for (int ii = threadIdx.x; ii < nall; ii += THREADS_PER_BLOCK) + { + int block_offset = blockIdx.x * mem_size; + // Load a segment of consecutive items that are blocked across threads + int i_data = temp_nlist[block_offset + ii]; + int o_data = i_data == -1 ? 0 : 1; + + // Collectively compute the block-wide exclusive prefix sum + BlockScan(temp_storage).ExclusiveSum( + o_data, o_data, prefix_op); + + __syncthreads(); + // Store scanned items to output segment + if (i_data != -1) { + nei_order[block_offset + ii] = o_data; + } + // Store numneigh into the output array + if (ii == nall - 1) { + o_data += i_data == -1 ? 
0 : 1; + numneigh[blockIdx.x] = o_data; + } + } +} + +template <typename FPTYPE> __device__ inline FPTYPE dev_dot( FPTYPE * arr1, @@ -45,29 +108,6 @@ __global__ void build_nlist( } } -__global__ void scan_nlist( - int * numneigh, - int * nei_order, - const int * temp_nlist, - const int mem_size, - const int nloc, - const int nall) -{ - const unsigned int atom_idx = blockIdx.x * blockDim.x + threadIdx.x; - if(atom_idx>>( - numneigh, - nei_order, - temp_nlist, - mem_size, - nloc, - nall); + parallel_prefix_scan<TPB> <<<nloc, TPB>>>( + numneigh, nei_order, + temp_nlist, mem_size, nloc, nall); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); fill_nlist<<<block_grid, thread_grid>>>( diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu index 243ea0507a..f1c3bc01e2 100644 --- a/source/lib/src/rocm/neighbor_list.hip.cu +++ b/source/lib/src/rocm/neighbor_list.hip.cu @@ -2,6 +2,69 @@ #include "device.h" #include "neighbor_list.h" +#include "hipcub/hipcub.hpp" +// A stateful callback functor that maintains a running prefix to be applied +// during consecutive scan operations. +struct parallel_prefix_scan_op +{ + // Running prefix + int running_total; + // Constructor + __device__ parallel_prefix_scan_op(int running_total) : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
+ __device__ int operator()(int block_aggregate) + { + int old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +template < + int THREADS_PER_BLOCK> +__global__ void parallel_prefix_scan( + int * numneigh, + int * nei_order, + const int * temp_nlist, + const int mem_size, + const int nloc, + const int nall +) +{ + // Specialize BlockScan for a 1D block of THREADS_PER_BLOCK threads scanning int items + typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScan; + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Initialize running total + parallel_prefix_scan_op prefix_op(0); + + // Have the block iterate over segments of items + for (int ii = threadIdx.x; ii < nall; ii += THREADS_PER_BLOCK) + { + int block_offset = blockIdx.x * mem_size; + // Load a segment of consecutive items that are blocked across threads + int i_data = temp_nlist[block_offset + ii]; + int o_data = i_data == -1 ? 0 : 1; + + // Collectively compute the block-wide exclusive prefix sum + BlockScan(temp_storage).ExclusiveSum( + o_data, o_data, prefix_op); + + __syncthreads(); + // Store scanned items to output segment + if (i_data != -1) { + nei_order[block_offset + ii] = o_data; + } + // Store numneigh into the output array + if (ii == nall - 1) { + o_data += i_data == -1 ? 0 : 1; + numneigh[blockIdx.x] = o_data; + } + } +} + template <typename FPTYPE> __device__ inline FPTYPE dev_dot( FPTYPE * arr1, @@ -45,29 +108,6 @@ __global__ void build_nlist( } } -__global__ void scan_nlist( - int * numneigh, - int * nei_order, - const int * temp_nlist, - const int mem_size, - const int nloc, - const int nall) -{ - const unsigned int atom_idx = blockIdx.x * blockDim.x + threadIdx.x; - if(atom_idx), nloc, TPB, 0, 0, + numneigh, nei_order, + temp_nlist, mem_size, nloc, nall); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0,