-
Notifications
You must be signed in to change notification settings - Fork 5
/
pi.cu
100 lines (77 loc) · 2.91 KB
/
pi.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#include <iostream>
#include <vector>
using namespace std;
// Create a kernel to estimate pi
__global__
void count_samples_in_circles(float* d_randNumsX, float* d_randNumsY, int* d_countInBlocks, int num_blocks, int nsamples)
{
__shared__ int shared_blocks[500];
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * num_blocks;
// Iterates through
int inCircle = 0;
for (int i = index; i < nsamples; i+= stride) {
float xValue = d_randNumsX[i];
float yValue = d_randNumsY[i];
if (xValue*xValue + yValue*yValue <= 1.0f) {
inCircle++;
}
}
shared_blocks[threadIdx.x] = inCircle;
__syncthreads();
// Pick thread 0 for each block to collect all points from each Thread.
if (threadIdx.x == 0)
{
int totalInCircleForABlock = 0;
for (int j = 0; j < blockDim.x; j++)
{
totalInCircleForABlock += shared_blocks[j];
}
d_countInBlocks[blockIdx.x] = totalInCircleForABlock;
}
}
int nsamples = 1e8;
int main(void) {
// allocate space to hold random values
vector<float> h_randNumsX(nsamples);
vector<float> h_randNumsY(nsamples);
srand(time(NULL)); // seed with system clock
//Initialize vector with random values
for (int i = 0; i < h_randNumsX.size(); ++i)
{
h_randNumsX[i] = float(rand()) / RAND_MAX;
h_randNumsY[i] = float(rand()) / RAND_MAX;
}
// Send random values to the GPU
size_t size = nsamples * sizeof(float);
float* d_randNumsX;
float* d_randNumsY;
cudaMalloc(&d_randNumsX, size);
cudaMalloc(&d_randNumsY, size);
cudaMemcpy(d_randNumsX, &h_randNumsX.front(), size, cudaMemcpyHostToDevice);
cudaMemcpy(d_randNumsY, &h_randNumsY.front(), size, cudaMemcpyHostToDevice);
// Launch kernel to count samples that fell inside unit circle
int threadsPerBlock = 500;
int num_blocks = nsamples / (1000 * threadsPerBlock);
size_t countBlocks = num_blocks * sizeof(int);
int* d_countInBlocks;
cudaMalloc(&d_countInBlocks, countBlocks);
// CALL KERNEL
count_samples_in_circles<<<num_blocks, threadsPerBlock>>>(d_randNumsX, d_randNumsY, d_countInBlocks, num_blocks, nsamples);
if ( cudaSuccess != cudaGetLastError() )
cout << "Error!\n";
// Return back the vector from device to host
int* h_countInBlocks = new int[num_blocks];
cudaMemcpy(h_countInBlocks, d_countInBlocks, countBlocks, cudaMemcpyDeviceToHost);
int nsamples_in_circle = 0;
for (int i = 0 ; i < num_blocks; i++) {
//cout << "Value in block " + i << " is " << h_countInBlocks[i] << endl;
nsamples_in_circle = nsamples_in_circle + h_countInBlocks[i];
}
cudaFree(d_randNumsX);
cudaFree(d_randNumsY);
cudaFree(d_countInBlocks);
// fraction that fell within (quarter) of unit circle
float estimatedValue = 4.0 * float(nsamples_in_circle) / nsamples;
cout << "Estimated Value: " << estimatedValue << endl;
}