-
Notifications
You must be signed in to change notification settings - Fork 5
/
helloWorld.cu
75 lines (62 loc) · 2.65 KB
/
helloWorld.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
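// Note: device-side printf needs compute capability >= 2.0. With an old toolkit compile with:
// nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld
// Newer toolkits no longer accept the compute_20/sm_20 targets; there, drop the -arch/-code flags
// (or name an architecture your GPU supports): nvcc helloWorld.cu -o helloWorld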
// Number of work items the launch is meant to cover. The kernel compares each thread's
// flat index against N and labels threads with index >= N as surplus.
#define N 20000
// Launch geometry used by main(): a GRID_D1 x GRID_D2 grid of blocks (third grid dimension is 1),
// each block holding BLOCK_D1 x BLOCK_D2 x BLOCK_D3 threads.
// With the values below: 20*2 = 40 blocks of 512*1*1 = 512 threads = 20480 threads, i.e. a few more than N.
#define GRID_D1 20
#define GRID_D2 2
#define BLOCK_D1 512
#define BLOCK_D2 1
#define BLOCK_D3 1
// Kernel executed once per launched thread.
// Each thread flattens {blockIdx, gridDim, threadIdx, blockDim} into one global index and,
// if that index lies in the first or last chunk of the launch, prints a greeting describing itself.
// Threads whose index is >= N additionally report that they would be unused for N work items.
__global__ void hello(void)
{
    // linear position of this block inside the 2D grid (x varies fastest)
    const int blockLinear = blockIdx.y * gridDim.x + blockIdx.x;
    // number of threads contained in one block
    const int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
    // linear position of this thread inside its 3D block (x fastest, then y, then z)
    const int threadLinear = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*(blockDim.x * blockDim.y);
    // globally unique thread index
    const int idx = blockLinear * threadsPerBlock + threadLinear;

    // The in-kernel print buffer is limited, so only the first and last chunks of threads report;
    // everything in between stays silent.
    if (idx >= 2000 && idx <= 19000) {
        return;
    }

    if (idx >= N) {
        // surplus thread: its index is past the N items of work
        printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], "
               "3D-thread index within block=(%d,%d,%d) => "
               "thread index=%d [### this thread would not be used for N=%d ###]\n",
               blockIdx.x, blockIdx.y, gridDim.x, gridDim.y,
               threadIdx.x, threadIdx.y, threadIdx.z, idx, N);
        return;
    }

    printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], "
           "3D-thread index within block=(%d,%d,%d) => "
           "thread index=%d\n",
           blockIdx.x, blockIdx.y, gridDim.x, gridDim.y,
           threadIdx.x, threadIdx.y, threadIdx.z, idx);
}
// Host entry point: launches the hello kernel on a GRID_D1 x GRID_D2 grid of
// BLOCK_D1 x BLOCK_D2 x BLOCK_D3 blocks, waits for it to finish so the device-side
// printf output is flushed, and reports whether the launch and execution succeeded.
// Command-line arguments are not used.
// Returns 0 on success and 1 if the launch or the kernel execution reported a CUDA error.
int main(int argc,char **argv)
{
    (void)argc;
    (void)argv;

    // objects containing the block and grid info
    const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3);
    const dim3 gridSize(GRID_D1, GRID_D2, 1);

    // total number of threads the launch creates
    int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*GRID_D1*GRID_D2;
    if (nthreads < N){
        // only a warning: the demo still launches so the thread layout can be inspected
        printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n",N);
    } else {
        printf("Launching %d threads (N=%d)\n",nthreads,N);
    }

    // launch the kernel on the specified grid of thread blocks
    hello<<<gridSize, blockSize>>>();

    // A kernel launch returns no status itself. Launch-time failures (invalid configuration,
    // no kernel image for this device, ...) are reported by cudaGetLastError() and need not be
    // returned by the synchronize call below, so they must be checked explicitly.
    cudaError_t cudaerr = cudaGetLastError();
    if (cudaerr == cudaSuccess) {
        // Wait for the kernel to finish. This also flushes the device-side printf buffer;
        // without it none of the kernel's prints would show up, as program exit does not
        // flush that buffer. Errors raised while the kernel ran are returned here.
        cudaerr = cudaDeviceSynchronize();
    }

    if (cudaerr != cudaSuccess){
        printf("kernel launch failed with error \"%s\".\n",
               cudaGetErrorString(cudaerr));
    } else {
        printf("kernel launch success!\n");
    }
    printf("That's all!\n");

    // propagate failure to the caller instead of always reporting success
    return (cudaerr == cudaSuccess) ? 0 : 1;
}