diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index 82c60daa1..067690cd1 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -687,6 +687,25 @@ double fn_memory(double *out, double *in) { free(res); \ } +#define INIT(x, y, val, dx, dy, d_val) \ +{ \ + double *fives = (double*)malloc(10 * sizeof(double)); \ + for(int i = 0; i < 10; i++) { \ + fives[i] = 5; \ + } \ + double *zeros = (double*)malloc(10 * sizeof(double)); \ + for(int i = 0; i < 10; i++) { \ + zeros[i] = 0; \ + } \ + cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); \ + free(fives); \ + free(zeros); \ +} int main(void) { int *a, *d_a; @@ -732,121 +751,79 @@ int main(void) { TEST_2_D(add_kernel_7, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 - cudaFree(dummy_in_double); - cudaFree(dummy_out_double); - cudaFree(d_out_double); - cudaFree(d_in_double); - - double *fives = (double*)malloc(10 * sizeof(double)); - for(int i = 0; i < 10; i++) { - fives[i] = 5; - } - double *zeros = (double*)malloc(10 * sizeof(double)); - for(int i = 0; i < 10; i++) { - zeros[i] = 0; - } - - double *x, *y, *dx, *dy, *d_val; - cudaMalloc(&x, 10 * sizeof(double)); - cudaMalloc(&y, 10 * sizeof(double)); - cudaMalloc(&dx, 10 * sizeof(double)); - cudaMalloc(&dy, 10 * sizeof(double)); + double *val; + cudaMalloc(&val, sizeof(double)); + double *d_val; cudaMalloc(&d_val, sizeof(double)); - - cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); + + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_device = clad::gradient(kernel_with_device_call, "out, val"); - test_device.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, d_val); + test_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); double *res = (double*)malloc(10 * sizeof(double)); cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 - cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_device_2 = clad::gradient(kernel_with_device_call_2, "out, val"); - test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, d_val); + test_device_2.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_val); cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 - cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_device_3 = clad::gradient(kernel_with_device_call_2, "out, in"); - test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, dx); + test_device_3.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); cudaDeviceSynchronize(); - cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 - cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_val, zeros, sizeof(double), cudaMemcpyHostToDevice); - - double *val; - cudaMalloc(&val, sizeof(double)); - cudaMemcpy(val, fives, sizeof(double), cudaMemcpyHostToDevice); + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_device_4 = clad::gradient(kernel_with_device_call_3); - test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, val, dy, dx, d_val); + test_device_4.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, val, d_out_double, d_in_double, d_val); cudaDeviceSynchronize(); - cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 cudaMemcpy(res, d_val, sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 - cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_kernel_call = clad::gradient(fn); - test_kernel_call.execute(y, x, dy, dx); + test_kernel_call.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); cudaDeviceSynchronize(); - cudaMemcpy(res, dx, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(res, d_in_double, sizeof(double), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); printf("%0.2f\n", *res); // CHECK-EXEC: 50.00 - cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto nested_device = clad::gradient(kernel_with_nested_device_call, "out, in"); - nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), y, x, 5, dy, dx); + nested_device.execute_kernel(dim3(1), dim3(10, 1, 1), dummy_out_double, dummy_in_double, 5, d_out_double, d_in_double); cudaDeviceSynchronize(); - cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 5.00, 5.00, 5.00 - cudaMemcpy(x, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(y, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dx, zeros, 10 * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(dy, fives, 10 * sizeof(double), cudaMemcpyHostToDevice); + INIT(dummy_in_double, dummy_out_double, val, d_in_double, d_out_double, d_val); auto test_memory = clad::gradient(fn_memory); - test_memory.execute(y, x, dy, dx); + test_memory.execute(dummy_out_double, dummy_in_double, d_out_double, d_in_double); cudaDeviceSynchronize(); - cudaMemcpy(res, dx, 10 * sizeof(double), cudaMemcpyDeviceToHost); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); // CHECK-EXEC: no error + cudaMemcpy(res, d_in_double, 10 * sizeof(double), cudaMemcpyDeviceToHost); printf("%0.2f, %0.2f, %0.2f\n", res[0], res[1], res[2]); // CHECK-EXEC: 50.00, 0.00, 0.00 - - free(fives); - free(zeros); + free(res); - cudaFree(dx); - cudaFree(dy); + cudaFree(dummy_in_double); + cudaFree(dummy_out_double); + cudaFree(d_out_double); + cudaFree(d_in_double); + cudaFree(val); cudaFree(d_val); return 0;