-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathreduce.cu
64 lines (45 loc) · 1.41 KB
/
reduce.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
// Sean Baxter's GTC talk companion source.
// http://nvlabs.github.io/moderngpu/
// Reduce example. This data decomposition assigns VT strided values to each
// thread.
#include "common.cuh"
template<int NT, int VT, typename T>
__global__ void KernelReduceSimple(const T* data_global, T* reduce_global) {
typedef CTAReduce<NT, T> Reduce;
__shared__ typename Reduce::Storage reduce_shared;
int tid = threadIdx.x;
// Load VT elements per thread in strided order.
// Pipelining loads from DRAM helps achieve peak bandwidth.
T x[VT];
#pragma unroll
for(int i = 0; i < VT; ++i)
x[i] = data_global[NT * i + tid];
// Reduce the elements within the thread. This requires commutativity.
T sum = x[0];
#pragma unroll
for(int i = 1; i < VT; ++i)
sum += x[i];
// Reduce the elements across the CTA.
sum = Reduce::Reduce(tid, sum, reduce_shared);
// Thread 0 stores the result to DRAM.
if(!tid)
reduce_global[0] = sum;
}
int main(int argc, char** argv) {
const int NT = 128;
const int VT = 8;
const int NV = NT * VT;
typedef int T;
std::vector<T> hostData(NV, (T)1);
T* data_global;
cudaMalloc2(&data_global, hostData);
T* reduce_global;
cudaMalloc2(&reduce_global, 1);
KernelReduceSimple<NT, VT><<<1, NT>>>(data_global, reduce_global);
T total;
copyDtoH(&total, reduce_global, 1);
cudaFree(reduce_global);
cudaFree(data_global);
printf("Reduction of %dx%d block: %d\n", NT, VT, total);
return 0;
}