Commit: Improve tests
kchristin22 committed Sep 24, 2024
1 parent f857e27 commit d02f3be
Showing 1 changed file with 84 additions and 47 deletions.
131 changes: 84 additions & 47 deletions test/CUDA/GradientKernels.cu
@@ -83,19 +83,20 @@ __global__ void add_kernel_3(int *out, int *in) {
//CHECK-NEXT: }
//CHECK-NEXT:}

-__global__ void add_kernel_4(int *out, int *in) {
+__global__ void add_kernel_4(int *out, int *in, int N) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < 5) {
+  if (index < N) {
int sum = 0;
// Each thread sums elements in steps of warpSize
-    for (int i = index; i < 5; i += warpSize) {
+    for (int i = index; i < N; i += warpSize) {
sum += in[i];
}
out[index] = sum;
}
}

-// CHECK: void add_kernel_4_grad(int *out, int *in, int *_d_out, int *_d_in) {
+// CHECK: void add_kernel_4_grad_0_1(int *out, int *in, int N, int *_d_out, int *_d_in) {
+//CHECK-NEXT: int _d_N = 0;
//CHECK-NEXT: bool _cond0;
//CHECK-NEXT: int _d_sum = 0;
//CHECK-NEXT: int sum = 0;
@@ -110,13 +111,13 @@ __global__ void add_kernel_4(int *out, int *in) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: {
-//CHECK-NEXT: _cond0 = index0 < 5;
+//CHECK-NEXT: _cond0 = index0 < N;
//CHECK-NEXT: if (_cond0) {
//CHECK-NEXT: sum = 0;
//CHECK-NEXT: _t2 = 0UL;
//CHECK-NEXT: for (i = index0; ; clad::push(_t3, i) , (i += warpSize)) {
//CHECK-NEXT: {
-//CHECK-NEXT: if (!(i < 5))
+//CHECK-NEXT: if (!(i < N))
//CHECK-NEXT: break;
//CHECK-NEXT: }
//CHECK-NEXT: _t2++;
@@ -155,23 +156,24 @@ __global__ void add_kernel_4(int *out, int *in) {
//CHECK-NEXT: }
//CHECK-NEXT:}
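Note on the renamed expectation above: once N is added as a plain value parameter, the kernel is no longer differentiated with respect to every parameter, and clad appends the positional indices of the differentiated ones to the gradient's name (out is parameter 0, in is parameter 1), hence add_kernel_4_grad_0_1. A minimal sketch of the corresponding request; the handle name grad is hypothetical:

    auto grad = clad::gradient(add_kernel_4, "in, out");
    // Adjoint buffers are supplied only for the pointer parameters (_d_out, _d_in);
    // N is passed by value and the gradient keeps just a local int _d_N = 0.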

-__global__ void add_kernel_5(int *out, int *in) {
+__global__ void add_kernel_5(int *out, int *in, int N) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < 5) {
+  if (index < N) {
int sum = 0;
// Calculate the total number of threads in the grid
int totalThreads = blockDim.x * gridDim.x;
// Each thread sums elements in steps of the total number of threads in the grid
-    for (int i = index; i < 5; i += totalThreads) {
+    for (int i = index; i < N; i += totalThreads) {
sum += in[i];
}
out[index] = sum;
}
}

-// CHECK: void add_kernel_5_grad(int *out, int *in, int *_d_out, int *_d_in) {
+// CHECK: void add_kernel_5_grad_0_1(int *out, int *in, int N, int *_d_out, int *_d_in) {
+//CHECK-NEXT: int _d_N = 0;
//CHECK-NEXT: bool _cond0;
//CHECK-NEXT: int _d_sum = 0;
//CHECK-NEXT: int sum = 0;
//CHECK-NEXT: unsigned int _t2;
//CHECK-NEXT: unsigned int _t3;
@@ -188,7 +190,7 @@ __global__ void add_kernel_5(int *out, int *in) {
//CHECK-NEXT: int _d_index = 0;
//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0;
//CHECK-NEXT: {
-//CHECK-NEXT: _cond0 = index0 < 5;
+//CHECK-NEXT: _cond0 = index0 < N;
//CHECK-NEXT: if (_cond0) {
//CHECK-NEXT: sum = 0;
//CHECK-NEXT: _t3 = blockDim.x;
@@ -197,7 +199,7 @@ __global__ void add_kernel_5(int *out, int *in) {
//CHECK-NEXT: _t4 = 0UL;
//CHECK-NEXT: for (i = index0; ; clad::push(_t5, i) , (i += totalThreads)) {
//CHECK-NEXT: {
-//CHECK-NEXT: if (!(i < 5))
+//CHECK-NEXT: if (!(i < N))
//CHECK-NEXT: break;
//CHECK-NEXT: }
//CHECK-NEXT: _t4++;
@@ -335,39 +337,74 @@ __global__ void add_kernel_6(double *a, double *b) {
free(res); \
}

+#define TEST_2_N(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \
+{ \
+  int *fives = (int*)malloc(N * sizeof(int)); \
+  for(int i = 0; i < N; i++) { \
+    fives[i] = 5; \
+  } \
+  int *zeros = (int*)malloc(N * sizeof(int)); \
+  for(int i = 0; i < N; i++) { \
+    zeros[i] = 0; \
+  } \
+  cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
+  cudaMemcpy(y, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
+  cudaMemcpy(dy, fives, N * sizeof(int), cudaMemcpyHostToDevice); \
+  cudaMemcpy(dx, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \
+  auto test = clad::gradient(F, args); \
+  if constexpr (use_stream) { \
+    cudaStream_t cudaStream; \
+    cudaStreamCreate(&cudaStream); \
+    test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, N, dy, dx); \
+  } \
+  else { \
+    test.execute_kernel(grid, block, y, x, N, dy, dx); \
+  } \
+  cudaDeviceSynchronize(); \
+  int *res = (int*)malloc(N * sizeof(int)); \
+  cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \
+  for (int i = 0; i < (N - 1); i++) { \
+    printf("%d, ", res[i]); \
+  } \
+  printf("%d\n", res[N-1]); \
+  free(fives); \
+  free(zeros); \
+  free(res); \
+}
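The use_stream branch above goes through clad's extended execute_kernel overload, whose leading arguments mirror the CUDA launch configuration <<<grid, block, shared_mem, stream>>>. A minimal sketch of that path outside the macro; the stream and handle names are hypothetical:

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    auto grad = clad::gradient(add_kernel_4, "in, out");
    grad.execute_kernel(dim3(1), dim3(5, 1, 1), 0, stream,
                        dummy_out, dummy_in, 5, d_out, d_in);
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);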

#define TEST_2_D(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \
-{ \
-double *fives = (double*)malloc(N * sizeof(double)); \
-for(int i = 0; i < N; i++) { \
-fives[i] = 5; \
-} \
-double *zeros = (double*)malloc(N * sizeof(double)); \
-for(int i = 0; i < N; i++) { \
-zeros[i] = 0; \
-} \
-cudaMemcpy(x, fives, N * sizeof(double), cudaMemcpyHostToDevice); \
-cudaMemcpy(y, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \
-cudaMemcpy(dy, fives, N * sizeof(double), cudaMemcpyHostToDevice); \
-cudaMemcpy(dx, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \
-auto test = clad::gradient(F, args); \
-if constexpr (use_stream) { \
-cudaStream_t cudaStream; \
-cudaStreamCreate(&cudaStream); \
-test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \
-} \
-else { \
-test.execute_kernel(grid, block, y, x, dy, dx); \
-} \
-cudaDeviceSynchronize(); \
-double *res = (double*)malloc(N * sizeof(double)); \
-cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \
-for (int i = 0; i < (N - 1); i++) { \
-printf("%0.2f, ", res[i]); \
-} \
-printf("%0.2f\n", res[N-1]); \
-free(fives); \
-free(zeros); \
-free(res); \
+{ \
+  double *fives = (double*)malloc(N * sizeof(double)); \
+  for(int i = 0; i < N; i++) { \
+    fives[i] = 5; \
+  } \
+  double *zeros = (double*)malloc(N * sizeof(double)); \
+  for(int i = 0; i < N; i++) { \
+    zeros[i] = 0; \
+  } \
+  cudaMemcpy(x, fives, N * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(y, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(dy, fives, N * sizeof(double), cudaMemcpyHostToDevice); \
+  cudaMemcpy(dx, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \
+  auto test = clad::gradient(F, args); \
+  if constexpr (use_stream) { \
+    cudaStream_t cudaStream; \
+    cudaStreamCreate(&cudaStream); \
+    test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \
+  } \
+  else { \
+    test.execute_kernel(grid, block, y, x, dy, dx); \
+  } \
+  cudaDeviceSynchronize(); \
+  double *res = (double*)malloc(N * sizeof(double)); \
+  cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \
+  for (int i = 0; i < (N - 1); i++) { \
+    printf("%0.2f, ", res[i]); \
+  } \
+  printf("%0.2f\n", res[N-1]); \
+  free(fives); \
+  free(zeros); \
+  free(res); \
}


@@ -398,8 +435,8 @@ int main(void) {
TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
-TEST_2(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
-TEST_2(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
+TEST_2_N(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
+TEST_2_N(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5

cudaFree(dummy_in);
cudaFree(dummy_out);
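For reference, the add_kernel_4 line above expands to roughly this host-side sequence (a minimal sketch of the TEST_2_N body with N = 5; error checking omitted):

    int *dummy_in, *dummy_out, *d_in, *d_out;
    cudaMalloc(&dummy_in, 5 * sizeof(int));
    cudaMalloc(&dummy_out, 5 * sizeof(int));
    cudaMalloc(&d_in, 5 * sizeof(int));
    cudaMalloc(&d_out, 5 * sizeof(int));
    // TEST_2_N seeds: in = {5,5,5,5,5} and _d_out = {5,5,5,5,5}; out and _d_in start at zero.
    auto grad = clad::gradient(add_kernel_4, "in, out");
    grad.execute_kernel(dim3(1), dim3(5, 1, 1), dummy_out, dummy_in, 5, d_out, d_in);
    cudaDeviceSynchronize();
    // Copying d_in back prints 5, 5, 5, 5, 5: with warpSize >= N each thread sums
    // exactly one element, so every _d_in[i] receives the seed _d_out[i] unchanged.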
