-
Notifications
You must be signed in to change notification settings - Fork 5
/
kernelExample.cu
154 lines (126 loc) · 4.59 KB
/
kernelExample.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#include <stdlib.h>
#include <sys/time.h>
#include<stdio.h>
#include<cuda.h>
#include<math.h>
#define SQRT_TWO_PI 2.506628274631000
#define BLOCK_D1 1024
#define BLOCK_D2 1
#define BLOCK_D3 1
// Note: Needs compute capability >= 2.0 for calculation with doubles, so compile with:
// nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample
// -use_fast_math doesn't seem to have any effect on speed
// CUDA kernel:
// CUDA kernel: evaluate the normal density N(mu, sigma^2) at each of the
// n values in vals, overwriting vals[i] in place.
// Expects a 2D grid (no third grid dimension) of 3D blocks; any excess
// threads beyond n simply return.
__global__ void calc_loglik(double* vals, int n, double mu, double sigma) {
  // flatten (blockIdx, threadIdx) into one global element index
  int block_id          = blockIdx.y * gridDim.x + blockIdx.x;
  int threads_per_block = blockDim.x * blockDim.y * blockDim.z;
  int thread_in_block   = threadIdx.z * (blockDim.x * blockDim.y)
                        + threadIdx.y * blockDim.x
                        + threadIdx.x;
  int i = block_id * threads_per_block + thread_in_block;
  if (i >= n) return;  // guard: grid may overshoot the data size
  double z = (vals[i] - mu) / sigma;
  vals[i] = exp(-0.5 * z * z) / (sigma * SQRT_TWO_PI);
}
// CPU analog for speed comparison
// CPU analog of the calc_loglik kernel, for speed comparison: evaluates
// the normal density N(mu, sigma^2) at each of the n values in vals,
// in place. Always returns 0.
int calc_loglik_cpu(double* vals, int n, double mu, double sigma) {
  const double sqrt_two_pi = 2.506628274631000;  // sqrt(2*pi)
  for (int i = 0; i < n; ++i) {
    double z = (vals[i] - mu) / sigma;
    vals[i] = exp(-0.5 * z * z) / (sigma * sqrt_two_pi);
  }
  return 0;
}
/* --------------------------- host code ------------------------------*/
// Fill p[0..n-1] with pseudo-random doubles in [-1, 1).
// The generator is reseeded with a fixed seed on every call, so the
// produced sequence is deterministic and reproducible across calls.
void fill( double *p, int n ) {
  srand48(0);
  for (int i = 0; i < n; ++i) {
    p[i] = 2.0 * drand48() - 1.0;
  }
}
// Wall-clock time in seconds since the Unix epoch, with microsecond
// resolution, used for coarse timing of the transfer/compute phases.
double read_timer() {
  struct timeval now;
  gettimeofday(&now, NULL);
  return (double) now.tv_sec + 1.e-6 * (double) now.tv_usec;
}
// Driver: for a sweep of problem sizes, times H2D transfer, GPU kernel,
// CPU reference, and D2H transfer for the normal-density calculation.
int main (int argc, char *argv[]) {
  double* cpu_vals;
  double* gpu_vals;
  int n;
  cudaError_t cudaStat;
  printf("====================================================\n");
  // sweep n from 32K to 128M elements, x8 each iteration
  for( n = 32768; n <= 134217728; n*=8 ) {
    cpu_vals = (double*) malloc( sizeof(double)*n );
    if (cpu_vals == NULL) {
      // BUGFIX: host allocation was previously unchecked
      printf("host memory allocation failed");
      return EXIT_FAILURE;
    }
    cudaStat = cudaMalloc(&gpu_vals, sizeof(double)*n);
    if (cudaStat != cudaSuccess) {
      printf ("device memory allocation failed");
      return EXIT_FAILURE;
    }
    // fixed block dimensions (1024x1x1 threads)
    const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3);
    // determine number of blocks we need for a given problem size;
    // laid out as a square (tmp x tmp) 2D grid so gridDim.x stays small.
    // BUGFIX: use explicit ceil-division instead of truncating integer
    // division inside pow(), which undercounted blocks when n was not a
    // multiple of the block size.
    int threadsPerBlock = BLOCK_D1*BLOCK_D2*BLOCK_D3;
    int nBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    int tmp = (int) ceil(sqrt((double) nBlocks));
    printf("Grid dimension is %i x %i\n", tmp, tmp);
    dim3 gridSize(tmp, tmp, 1);
    int nthreads = threadsPerBlock*tmp*tmp;
    if (nthreads < n){
      printf("\n============ NOT ENOUGH THREADS TO COVER n=%d ===============\n\n",n);
    } else {
      printf("Launching %d threads (n=%d)\n", nthreads, n);
    }
    double mu = 0.0;
    double sigma = 1.0;
    // simulate 'data'
    fill(cpu_vals, n);
    printf("Input values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]);
    cudaDeviceSynchronize();
    double tInit = read_timer();
    // copy input data to the GPU
    cudaStat = cudaMemcpy(gpu_vals, cpu_vals, n*sizeof(double), cudaMemcpyHostToDevice);
    printf("Memory Copy from Host to Device ");
    if (cudaStat != cudaSuccess){
      printf("failed.\n");
    } else {
      printf("successful.\n");
    }
    cudaDeviceSynchronize();
    double tTransferToGPU = read_timer();
    // do the calculation
    calc_loglik<<<gridSize, blockSize>>>(gpu_vals, n, mu, sigma);
    // BUGFIX: check for launch-configuration errors; kernel launches do
    // not report failures through a return value.
    cudaStat = cudaGetLastError();
    if (cudaStat != cudaSuccess) {
      printf("kernel launch failed: %s\n", cudaGetErrorString(cudaStat));
    }
    cudaDeviceSynchronize();
    double tCalc = read_timer();
    // BUGFIX: copy n*sizeof(double) bytes back, not n bytes — the original
    // retrieved only 1/8th of the results, leaving the rest of cpu_vals
    // holding stale input data.
    cudaStat = cudaMemcpy(cpu_vals, gpu_vals, n*sizeof(double), cudaMemcpyDeviceToHost);
    printf("Memory Copy from Device to Host ");
    if (cudaStat != cudaSuccess){
      printf("failed.\n");
    } else {
      printf("successful.\n");
    }
    cudaDeviceSynchronize();
    double tTransferFromGPU = read_timer();
    printf("Output values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]);
    // do calculation on CPU for comparison (unfair as this will only use one core)
    fill(cpu_vals, n);
    double tInit2 = read_timer();
    calc_loglik_cpu(cpu_vals, n, mu, sigma);
    double tCalcCPU = read_timer();
    printf("Output values (CPU): %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]);
    printf("Timing results for n = %d\n", n);
    printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit);
    printf("Calculation time (GPU): %f\n", tCalc - tTransferToGPU);
    printf("Calculation time (CPU): %f\n", tCalcCPU - tInit2);
    printf("Transfer from GPU time: %f\n", tTransferFromGPU - tCalc);
    printf("Freeing memory...\n");
    printf("====================================================\n");
    free(cpu_vals);
    cudaFree(gpu_vals);
  }
  printf("\n\nFinished.\n\n");
  return 0;
}