From 5d028c4df824685b091951e79aa2703aa035c247 Mon Sep 17 00:00:00 2001 From: Denghui Lu <denghuilu@pku.edu.cn> Date: Thu, 26 Aug 2021 07:43:12 +0800 Subject: [PATCH] speedup scan_nlist kernel (#1028) * speedup cuda kernel scan_nlist * fix no-pbc error --- source/lib/src/cuda/neighbor_list.cu | 97 +++++++++++++++-------- source/lib/src/rocm/neighbor_list.hip.cu | 98 ++++++++++++++++-------- 2 files changed, 133 insertions(+), 62 deletions(-) diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/cuda/neighbor_list.cu index 66bd122079..430c869f4e 100644 --- a/source/lib/src/cuda/neighbor_list.cu +++ b/source/lib/src/cuda/neighbor_list.cu @@ -2,6 +2,69 @@ #include "gpu_cuda.h" #include "neighbor_list.h" +#include <cub/cub.cuh> +// A stateful callback functor that maintains a running prefix to be applied +// during consecutive scan operations. +struct parallel_prefix_scan_op +{ + // Running prefix + int running_total; + // Constructor + __device__ parallel_prefix_scan_op(int running_total) : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
+ __device__ int operator()(int block_aggregate) + { + int old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +template < + int THREADS_PER_BLOCK> +__global__ void parallel_prefix_scan( + int * numneigh, + int * nei_order, + const int * temp_nlist, + const int mem_size, + const int nloc, + const int nall +) +{ + // Specialize BlockScan for a 1D block of THREADS_PER_BLOCK threads scanning int items + typedef cub::BlockScan<int, THREADS_PER_BLOCK> BlockScan; + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Initialize running total + parallel_prefix_scan_op prefix_op(0); + + // Have the block iterate over segments of items + for (int ii = threadIdx.x; ii < nall; ii += THREADS_PER_BLOCK) + { + int block_offset = blockIdx.x * mem_size; + // Load a segment of consecutive items that are blocked across threads + int i_data = temp_nlist[block_offset + ii]; + int o_data = i_data == -1 ? 0 : 1; + + // Collectively compute the block-wide exclusive prefix sum + BlockScan(temp_storage).ExclusiveSum( + o_data, o_data, prefix_op); + + __syncthreads(); + // Store scanned items to output segment + if (i_data != -1) { + nei_order[block_offset + ii] = o_data; + } + // Store numneigh into the output array + if (ii == nall - 1) { + o_data += i_data == -1 ? 
0 : 1; + numneigh[blockIdx.x] = o_data; + } + } +} + +template <typename FPTYPE> __device__ inline FPTYPE dev_dot( FPTYPE * arr1, @@ -45,29 +108,6 @@ __global__ void build_nlist( } } -__global__ void scan_nlist( - int * numneigh, - int * nei_order, - const int * temp_nlist, - const int mem_size, - const int nloc, - const int nall) -{ - const unsigned int atom_idx = blockIdx.x * blockDim.x + threadIdx.x; - if(atom_idx>>( - numneigh, - nei_order, - temp_nlist, - mem_size, - nloc, - nall); + parallel_prefix_scan<TPB> <<<nloc, TPB>>>( + numneigh, nei_order, + temp_nlist, mem_size, nloc, nall); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); fill_nlist<<<block_grid, thread_grid>>>( diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu index 243ea0507a..f1c3bc01e2 100644 --- a/source/lib/src/rocm/neighbor_list.hip.cu +++ b/source/lib/src/rocm/neighbor_list.hip.cu @@ -2,6 +2,69 @@ #include "device.h" #include "neighbor_list.h" +#include "hipcub/hipcub.hpp" +// A stateful callback functor that maintains a running prefix to be applied +// during consecutive scan operations. +struct parallel_prefix_scan_op +{ + // Running prefix + int running_total; + // Constructor + __device__ parallel_prefix_scan_op(int running_total) : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
+ __device__ int operator()(int block_aggregate) + { + int old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +template < + int THREADS_PER_BLOCK> +__global__ void parallel_prefix_scan( + int * numneigh, + int * nei_order, + const int * temp_nlist, + const int mem_size, + const int nloc, + const int nall +) +{ + // Specialize BlockScan for a 1D block of THREADS_PER_BLOCK threads scanning int items + typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScan; + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Initialize running total + parallel_prefix_scan_op prefix_op(0); + + // Have the block iterate over segments of items + for (int ii = threadIdx.x; ii < nall; ii += THREADS_PER_BLOCK) + { + int block_offset = blockIdx.x * mem_size; + // Load a segment of consecutive items that are blocked across threads + int i_data = temp_nlist[block_offset + ii]; + int o_data = i_data == -1 ? 0 : 1; + + // Collectively compute the block-wide exclusive prefix sum + BlockScan(temp_storage).ExclusiveSum( + o_data, o_data, prefix_op); + + __syncthreads(); + // Store scanned items to output segment + if (i_data != -1) { + nei_order[block_offset + ii] = o_data; + } + // Store numneigh into the output array + if (ii == nall - 1) { + o_data += i_data == -1 ? 0 : 1; + numneigh[blockIdx.x] = o_data; + } + } +} + template <typename FPTYPE> __device__ inline FPTYPE dev_dot( FPTYPE * arr1, @@ -45,29 +108,6 @@ __global__ void build_nlist( } } -__global__ void scan_nlist( - int * numneigh, - int * nei_order, - const int * temp_nlist, - const int mem_size, - const int nloc, - const int nall) -{ - const unsigned int atom_idx = blockIdx.x * blockDim.x + threadIdx.x; - if(atom_idx), nloc, TPB, 0, 0, + numneigh, nei_order, + temp_nlist, mem_size, nloc, nall); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0,