Skip to content

Commit

Permalink
speedup scan_nlist kernel (deepmodeling#1028)
Browse files Browse the repository at this point in the history
* Speed up the scan_nlist CUDA kernel by replacing the per-atom sequential
  scan with a block-parallel CUB/hipCUB exclusive prefix sum

* Fix an error in the no-PBC (no periodic boundary conditions) code path
  • Loading branch information
denghuilu authored Aug 25, 2021
1 parent 196b1dc commit 5d028c4
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 62 deletions.
97 changes: 66 additions & 31 deletions source/lib/src/cuda/neighbor_list.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,69 @@
#include "gpu_cuda.h"
#include "neighbor_list.h"

#include <cub/block/block_scan.cuh>
// A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
// Stateful callback functor for cub::BlockScan: carries the aggregate of all
// previously scanned tiles so that consecutive block-wide scans chain into a
// single long exclusive scan over the whole row.
struct parallel_prefix_scan_op
{
  // Sum of every item consumed by earlier tile scans.
  int running_total;

  __device__ parallel_prefix_scan_op(int running_total) : running_total(running_total) {}

  // Called by thread 0 of the first warp with this tile's block-wide
  // aggregate; returns the prefix that seeds the current tile's scan,
  // then advances the running total past the tile.
  __device__ int operator()(int block_aggregate)
  {
    const int tile_prefix = running_total;
    running_total = tile_prefix + block_aggregate;
    return tile_prefix;
  }
};

template <
    int THREADS_PER_BLOCK>
__global__ void parallel_prefix_scan(
    int * numneigh,
    int * nei_order,
    const int * temp_nlist,
    const int mem_size,
    const int nloc,
    const int nall
)
{
  // One block per local atom row (launch with gridDim.x == nloc, blockDim.x
  // == THREADS_PER_BLOCK).  Each block compacts its row of temp_nlist
  // (mem_size ints, entries of -1 meaning "no neighbor") into nei_order via
  // an exclusive prefix sum, and writes the row's neighbor count into
  // numneigh[blockIdx.x].
  typedef cub::BlockScan<int, THREADS_PER_BLOCK> BlockScan;
  // Shared scratch space required by the CUB block-wide scan.
  __shared__ typename BlockScan::TempStorage temp_storage;

  // Running prefix carried across consecutive tile scans of this row.
  parallel_prefix_scan_op prefix_op(0);

  // Hoisted: start of this atom's row; invariant across the loop.
  const int block_offset = blockIdx.x * mem_size;

  // Round the iteration bound up to a whole number of tiles so that EVERY
  // thread of the block reaches the collective BlockScan call and the
  // __syncthreads() in each iteration.  CUB block-wide collectives (and
  // barriers) are undefined when only a subset of the block participates,
  // which a plain `ii < nall` bound allows whenever nall is not a multiple
  // of THREADS_PER_BLOCK.
  const int bound =
      ((nall + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK) * THREADS_PER_BLOCK;
  for (int ii = threadIdx.x; ii < bound; ii += THREADS_PER_BLOCK)
  {
    const bool in_range = ii < nall;
    // Out-of-range lanes contribute a padding item (-1 -> 0) to the scan.
    const int i_data = in_range ? temp_nlist[block_offset + ii] : -1;
    int o_data = (i_data == -1) ? 0 : 1;

    // Block-wide exclusive prefix sum over this tile, seeded with the
    // running total of all previous tiles.
    BlockScan(temp_storage).ExclusiveSum(
        o_data, o_data, prefix_op);

    // temp_storage is reused by the next iteration; barrier before reuse.
    __syncthreads();

    // o_data now holds the compacted position of neighbor ii in the row.
    if (in_range && i_data != -1) {
      nei_order[block_offset + ii] = o_data;
    }
    // The thread holding the last real item knows the row total: its
    // exclusive prefix plus its own contribution (1 if it is a neighbor).
    if (ii == nall - 1) {
      numneigh[blockIdx.x] = o_data + ((i_data == -1) ? 0 : 1);
    }
  }
}

template<typename FPTYPE>
__device__ inline FPTYPE dev_dot(
FPTYPE * arr1,
Expand Down Expand Up @@ -45,29 +108,6 @@ __global__ void build_nlist(
}
}

// One thread per local atom: sequentially compact the atom's neighbor row.
// For each non-(-1) entry of temp_nlist, nei_order receives that entry's
// rank among the row's valid neighbors; numneigh receives the row total.
__global__ void scan_nlist(
    int * numneigh,
    int * nei_order,
    const int * temp_nlist,
    const int mem_size,
    const int nloc,
    const int nall)
{
  const unsigned int atom_idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the grid tail: only nloc rows exist.
  if (atom_idx >= nloc) {
    return;
  }
  const int * row_nlist = temp_nlist + atom_idx * mem_size;
  int * row_order = nei_order + atom_idx * mem_size;
  int count = 0;
  for (int jj = 0; jj < nall; ++jj) {
    if (row_nlist[jj] != -1) {
      // Assign the next compacted slot to this neighbor.
      row_order[jj] = count++;
    }
  }
  numneigh[atom_idx] = count;
}

__global__ void fill_nlist(
int ** firstneigh,
const int * temp_nlist,
Expand Down Expand Up @@ -143,14 +183,9 @@ int build_nlist_gpu(
mem_size);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nblock_ = (nloc+TPB-1)/TPB;
scan_nlist<<<nblock_, TPB>>>(
numneigh,
nei_order,
temp_nlist,
mem_size,
nloc,
nall);
parallel_prefix_scan<TPB> <<<nloc, TPB>>>(
numneigh, nei_order,
temp_nlist, mem_size, nloc, nall);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
fill_nlist<<<block_grid, thread_grid>>>(
Expand Down
98 changes: 67 additions & 31 deletions source/lib/src/rocm/neighbor_list.hip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,69 @@
#include "device.h"
#include "neighbor_list.h"

#include "hipcub/hipcub.hpp"
// A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
// Stateful callback functor for hipcub::BlockScan: carries the aggregate of
// all previously scanned tiles so that consecutive block-wide scans chain
// into a single long exclusive scan over the whole row.
struct parallel_prefix_scan_op
{
  // Sum of every item consumed by earlier tile scans.
  int running_total;

  __device__ parallel_prefix_scan_op(int running_total) : running_total(running_total) {}

  // Called by thread 0 of the first warp with this tile's block-wide
  // aggregate; returns the prefix that seeds the current tile's scan,
  // then advances the running total past the tile.
  __device__ int operator()(int block_aggregate)
  {
    const int tile_prefix = running_total;
    running_total = tile_prefix + block_aggregate;
    return tile_prefix;
  }
};

template <
    int THREADS_PER_BLOCK>
__global__ void parallel_prefix_scan(
    int * numneigh,
    int * nei_order,
    const int * temp_nlist,
    const int mem_size,
    const int nloc,
    const int nall
)
{
  // One block per local atom row (launch with gridDim.x == nloc, blockDim.x
  // == THREADS_PER_BLOCK).  Each block compacts its row of temp_nlist
  // (mem_size ints, entries of -1 meaning "no neighbor") into nei_order via
  // an exclusive prefix sum, and writes the row's neighbor count into
  // numneigh[blockIdx.x].
  typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScan;
  // Shared scratch space required by the hipCUB block-wide scan.
  __shared__ typename BlockScan::TempStorage temp_storage;

  // Running prefix carried across consecutive tile scans of this row.
  parallel_prefix_scan_op prefix_op(0);

  // Hoisted: start of this atom's row; invariant across the loop.
  const int block_offset = blockIdx.x * mem_size;

  // Round the iteration bound up to a whole number of tiles so that EVERY
  // thread of the block reaches the collective BlockScan call and the
  // __syncthreads() in each iteration.  hipCUB block-wide collectives (and
  // barriers) are undefined when only a subset of the block participates,
  // which a plain `ii < nall` bound allows whenever nall is not a multiple
  // of THREADS_PER_BLOCK.
  const int bound =
      ((nall + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK) * THREADS_PER_BLOCK;
  for (int ii = threadIdx.x; ii < bound; ii += THREADS_PER_BLOCK)
  {
    const bool in_range = ii < nall;
    // Out-of-range lanes contribute a padding item (-1 -> 0) to the scan.
    const int i_data = in_range ? temp_nlist[block_offset + ii] : -1;
    int o_data = (i_data == -1) ? 0 : 1;

    // Block-wide exclusive prefix sum over this tile, seeded with the
    // running total of all previous tiles.
    BlockScan(temp_storage).ExclusiveSum(
        o_data, o_data, prefix_op);

    // temp_storage is reused by the next iteration; barrier before reuse.
    __syncthreads();

    // o_data now holds the compacted position of neighbor ii in the row.
    if (in_range && i_data != -1) {
      nei_order[block_offset + ii] = o_data;
    }
    // The thread holding the last real item knows the row total: its
    // exclusive prefix plus its own contribution (1 if it is a neighbor).
    if (ii == nall - 1) {
      numneigh[blockIdx.x] = o_data + ((i_data == -1) ? 0 : 1);
    }
  }
}

template<typename FPTYPE>
__device__ inline FPTYPE dev_dot(
FPTYPE * arr1,
Expand Down Expand Up @@ -45,29 +108,6 @@ __global__ void build_nlist(
}
}

// One thread per local atom: sequentially compact the atom's neighbor row.
// For each non-(-1) entry of temp_nlist, nei_order receives that entry's
// rank among the row's valid neighbors; numneigh receives the row total.
__global__ void scan_nlist(
    int * numneigh,
    int * nei_order,
    const int * temp_nlist,
    const int mem_size,
    const int nloc,
    const int nall)
{
  const unsigned int atom_idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the grid tail: only nloc rows exist.
  if (atom_idx >= nloc) {
    return;
  }
  const int * row_nlist = temp_nlist + atom_idx * mem_size;
  int * row_order = nei_order + atom_idx * mem_size;
  int count = 0;
  for (int jj = 0; jj < nall; ++jj) {
    if (row_nlist[jj] != -1) {
      // Assign the next compacted slot to this neighbor.
      row_order[jj] = count++;
    }
  }
  numneigh[atom_idx] = count;
}

__global__ void fill_nlist(
int ** firstneigh,
const int * temp_nlist,
Expand Down Expand Up @@ -143,14 +183,10 @@ int build_nlist_gpu_rocm(
mem_size);
DPErrcheck(hipGetLastError());
DPErrcheck(hipDeviceSynchronize());
const int nblock_ = (nloc+TPB-1)/TPB;
hipLaunchKernelGGL(scan_nlist, nblock_, TPB, 0, 0,
numneigh,
nei_order,
temp_nlist,
mem_size,
nloc,
nall);
hipLaunchKernelGGL(
HIP_KERNEL_NAME(parallel_prefix_scan<TPB>), nloc, TPB, 0, 0,
numneigh, nei_order,
temp_nlist, mem_size, nloc, nall);
DPErrcheck(hipGetLastError());
DPErrcheck(hipDeviceSynchronize());
hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0,
Expand Down

0 comments on commit 5d028c4

Please sign in to comment.