-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: gpu_thread.h
77 lines (52 loc) · 1.59 KB
/
gpu_thread.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
// Reduced matrix multiplication kernel.
//
// Inputs s and t are N x N (row-major) int matrices; the output v is
// (N/2) x (N/2), where each element v[r][c] is the sum of the four dot
// products pairing rows 2r and 2r+1 of s with columns 2c and 2c+1 of t.
//
// Launch layout: 2-D grid of 2-D blocks; one thread per output element.
// The host launches an N x N thread grid, so out-of-range threads must
// exit early (see guard below).
__global__ void multiplication_matrix(int *s, int *t, int *v, int N) {
    // Output row/column this thread is responsible for.
    // (The original referenced undeclared `row`/`col`; these are the
    // variables the index math was computed into.)
    int rval = blockIdx.y * blockDim.y + threadIdx.y;
    int cval = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: the grid covers N x N threads but only (N/2) x (N/2)
    // outputs exist. Without this, threads with rval >= N/2 read
    // s[(2*rval+1)*N + i], which is past the end of the N x N input.
    if (rval >= N / 2 || cval >= N / 2)
        return;

    int res = 0;
    for (int i = 0; i < N; i++) {
        // Accumulate the 2x2 block of dot products in a single pass over i.
        res += s[2 * rval * N + i]       * t[i * N + 2 * cval];
        res += s[(2 * rval + 1) * N + i] * t[i * N + 2 * cval];
        res += s[2 * rval * N + i]       * t[i * N + (2 * cval + 1)];
        res += s[(2 * rval + 1) * N + i] * t[i * N + (2 * cval + 1)];
    }

    // Result produced: one element of the (N/2) x (N/2) output.
    v[rval * (N / 2) + cval] = res;
}
// Host wrapper: runs multiplication_matrix on the GPU.
//   N      : edge length of the square inputs (matA and matB are N x N ints).
//   matA   : host pointer to the first input matrix.
//   matB   : host pointer to the second input matrix.
//   output : host pointer receiving the (N/2) x (N/2) int result.
// Precondition: N must be a multiple of 32 (the block edge), so the grid
// divides the problem evenly.
void gpuThread(int N, int *matA, int *matB, int *output)
{
    size_t bytes = (size_t)N * N * sizeof(int);  // size of each input matrix
    size_t outBytes = bytes / 4;                 // output is (N/2) x (N/2)

    // Allocate device memory. d_c is kept at the full input size as slack
    // headroom; only the first outBytes are read back.
    int *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    // Copy the inputs to the device. The result buffer is produced entirely
    // on the device, so no host-to-device copy of `output` is needed
    // (the original staged bytes/4 of uninitialized output pointlessly).
    cudaMemcpy(d_a, matA, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, matB, bytes, cudaMemcpyHostToDevice);

    // Launch configuration: 32x32 threads per block, one thread per element
    // of the N x N index space (assumes 32 divides N evenly).
    int THREADS = 32;
    int BLOCKS = N / THREADS;
    dim3 threads(THREADS, THREADS);
    dim3 blocks(BLOCKS, BLOCKS);

    // Launch kernel.
    multiplication_matrix<<<blocks, threads>>>(d_a, d_b, d_c, N);

    // Surface launch/configuration errors; kernel launches do not return
    // a status directly.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cerr << "multiplication_matrix launch failed: "
                  << cudaGetErrorString(err) << "\n";

    // Copy only the (N/2) x (N/2) result back. The original copied `bytes`
    // (N*N ints), overrunning a correctly sized host output buffer.
    cudaMemcpy(output, d_c, outBytes, cudaMemcpyDeviceToHost);

    // Free memory on device.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    std::cout << "executed\n";
}