From 6d6f85371d9ad7048fa3abb989ee33861c89973e Mon Sep 17 00:00:00 2001 From: kchristin Date: Tue, 24 Sep 2024 12:25:37 +0300 Subject: [PATCH] Add atomic double test and fix tests --- lib/Differentiator/ReverseModeVisitor.cpp | 19 +-- test/CMakeLists.txt | 2 +- test/CUDA/GradientKernels.cu | 154 ++++++++++++++++++---- 3 files changed, 139 insertions(+), 36 deletions(-) diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index edfa323bf..f24a657f0 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -1493,14 +1493,10 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, m_Context.getTranslationUnitDecl()); FunctionDecl* atomicAddFunc = nullptr; - for (LookupResult::iterator it = lookupResult.begin(); - it != lookupResult.end(); it++) { - NamedDecl* decl = *it; + for (auto decl : lookupResult) { // FIXME: check for underlying types of the pointers if (dyn_cast(decl)->getReturnType() == result->getType()) { - printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(), - result->getType().getAsString().c_str()); atomicAddFunc = dyn_cast(decl); break; } @@ -2312,21 +2308,20 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, DeclarationName atomicAddId = &m_Context.Idents.get("atomicAdd"); LookupResult lookupResult(m_Sema, atomicAddId, SourceLocation(), Sema::LookupOrdinaryName); - m_Sema.LookupQualifiedName(lookupResult, m_Context.getTranslationUnitDecl()); + m_Sema.LookupQualifiedName(lookupResult, + m_Context.getTranslationUnitDecl()); FunctionDecl* atomicAddFunc = nullptr; - for (LookupResult::iterator it = lookupResult.begin(); - it != lookupResult.end(); it++) { - NamedDecl* decl = *it; + for (auto decl : lookupResult) { // FIXME: check for underlying types of the pointers - if (dyn_cast(decl)->getReturnType() == derivedE->getType()) { - printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(), derivedE->getType().getAsString().c_str()); + if (dyn_cast(decl)->getReturnType() == + derivedE->getType()) { atomicAddFunc = dyn_cast(decl); break; } } assert(atomicAddFunc && "atomicAdd function not found"); - llvm::SmallVector atomicArgs = {diff_dx, dfdx()}; + llvm::SmallVector atomicArgs = {diff_dx, dfdx()}; Expr* atomicCall = BuildCallExprToFunction(atomicAddFunc, atomicArgs); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a1d17f51d..0264fc4d3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -29,7 +29,7 @@ if(CUDAToolkit_FOUND) get_filename_component(CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE) get_filename_component(CUDA_LIBDIR "${CUDA_cudart_static_LIBRARY}" DIRECTORY) - set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_50") + set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_60") if(TARGET nvptx-arch) get_property(LIBOMPTARGET_NVPTX_ARCH TARGET nvptx-arch PROPERTY LOCATION) diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index 725236992..775773401 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -25,7 +25,7 @@ __global__ void kernel(int *a) { //CHECK-NEXT: int _r_d0 = *_d_a; //CHECK-NEXT: *_d_a = 0; //CHECK-NEXT: *_d_a += _r_d0 * *a; -//CHECK-NEXT: *_d_a += *a * _r_d0; +//CHECK-NEXT: atomicAdd(_d_a, *a * _r_d0); //CHECK-NEXT: } //CHECK-NEXT: } @@ -46,7 +46,7 @@ __global__ void add_kernel(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: out[index0] = _t0; //CHECK-NEXT: int _r_d0 = _d_out[index0]; -//CHECK-NEXT: _d_in[index0] += _r_d0; +//CHECK-NEXT: atomicAdd(&_d_in[index0], _r_d0); //CHECK-NEXT: } //CHECK-NEXT: } @@ -60,7 +60,7 @@ __global__ void add_kernel_2(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: out[threadIdx.x] = _t0; //CHECK-NEXT: int _r_d0 = _d_out[threadIdx.x]; -//CHECK-NEXT: _d_in[threadIdx.x] += _r_d0; +//CHECK-NEXT: atomicAdd(&_d_in[threadIdx.x], _r_d0); //CHECK-NEXT: } //CHECK-NEXT: } @@ -79,23 +79,24 @@ __global__ void add_kernel_3(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: out[index0] = _t2; //CHECK-NEXT: int _r_d0 = _d_out[index0]; -//CHECK-NEXT: _d_in[index0] += _r_d0; +//CHECK-NEXT: atomicAdd(&_d_in[index0], _r_d0); //CHECK-NEXT: } //CHECK-NEXT:} -__global__ void add_kernel_4(int *out, int *in) { +__global__ void add_kernel_4(int *out, int *in, int N) { int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < 5) { + if (index < N) { int sum = 0; // Each thread sums elements in steps of warpSize - for (int i = index; i < 5; i += warpSize) { + for (int i = index; i < N; i += warpSize) { sum += in[i]; } out[index] = sum; } } -// CHECK: void add_kernel_4_grad(int *out, int *in, int *_d_out, int *_d_in) { +// CHECK: void add_kernel_4_grad_0_1(int *out, int *in, int N, int *_d_out, int *_d_in) { +//CHECK-NEXT: int _d_N = 0; //CHECK-NEXT: bool _cond0; //CHECK-NEXT: int _d_sum = 0; //CHECK-NEXT: int sum = 0; @@ -110,13 +111,13 @@ __global__ void add_kernel_4(int *out, int *in) { //CHECK-NEXT: int _d_index = 0; //CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; //CHECK-NEXT: { -//CHECK-NEXT: _cond0 = index0 < 5; +//CHECK-NEXT: _cond0 = index0 < N; //CHECK-NEXT: if (_cond0) { //CHECK-NEXT: sum = 0; //CHECK-NEXT: _t2 = 0UL; //CHECK-NEXT: for (i = index0; ; clad::push(_t3, i) , (i += warpSize)) { //CHECK-NEXT: { -//CHECK-NEXT: if (!(i < 5)) +//CHECK-NEXT: if (!(i < N)) //CHECK-NEXT: break; //CHECK-NEXT: } //CHECK-NEXT: _t2++; @@ -147,7 +148,7 @@ __global__ void add_kernel_4(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: sum = clad::pop(_t4); //CHECK-NEXT: int _r_d1 = _d_sum; -//CHECK-NEXT: _d_in[i] += _r_d1; +//CHECK-NEXT: atomicAdd(&_d_in[i], _r_d1); //CHECK-NEXT: } //CHECK-NEXT: } //CHECK-NEXT: _d_index += _d_i; @@ -155,23 +156,24 @@ __global__ void add_kernel_4(int *out, int *in) { //CHECK-NEXT: } //CHECK-NEXT:} -__global__ void add_kernel_5(int *out, int *in) { +__global__ void add_kernel_5(int *out, int *in, int N) { int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < 5) { + if (index < N) { int sum = 0; // Calculate the total number of threads in the grid int totalThreads = blockDim.x * gridDim.x; // Each thread sums elements in steps of the total number of threads in the grid - for (int i = index; i < 5; i += totalThreads) { + for (int i = index; i < N; i += totalThreads) { sum += in[i]; } out[index] = sum; } } -// CHECK: void add_kernel_5_grad(int *out, int *in, int *_d_out, int *_d_in) { +// CHECK: void add_kernel_5_grad_0_1(int *out, int *in, int N, int *_d_out, int *_d_in) { +//CHECK-NEXT: int _d_N = 0; //CHECK-NEXT: bool _cond0; -//CHECK-NEXT: int _d_sum = 0; +//CHECK-NEXT: int _d_sum = 0; //CHECK-NEXT: int sum = 0; //CHECK-NEXT: unsigned int _t2; //CHECK-NEXT: unsigned int _t3; @@ -188,7 +190,7 @@ __global__ void add_kernel_5(int *out, int *in) { //CHECK-NEXT: int _d_index = 0; //CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; //CHECK-NEXT: { -//CHECK-NEXT: _cond0 = index0 < 5; +//CHECK-NEXT: _cond0 = index0 < N; //CHECK-NEXT: if (_cond0) { //CHECK-NEXT: sum = 0; //CHECK-NEXT: _t3 = blockDim.x; @@ -197,7 +199,7 @@ __global__ void add_kernel_5(int *out, int *in) { //CHECK-NEXT: _t4 = 0UL; //CHECK-NEXT: for (i = index0; ; clad::push(_t5, i) , (i += totalThreads)) { //CHECK-NEXT: { -//CHECK-NEXT: if (!(i < 5)) +//CHECK-NEXT: if (!(i < N)) //CHECK-NEXT: break; //CHECK-NEXT: } //CHECK-NEXT: _t4++; @@ -229,7 +231,7 @@ __global__ void add_kernel_5(int *out, int *in) { //CHECK-NEXT: { //CHECK-NEXT: sum = clad::pop(_t6); //CHECK-NEXT: int _r_d1 = _d_sum; -//CHECK-NEXT: _d_in[i] += _r_d1; +//CHECK-NEXT: atomicAdd(&_d_in[i], _r_d1); //CHECK-NEXT: } //CHECK-NEXT: } //CHECK-NEXT: _d_index += _d_i; @@ -237,12 +239,35 @@ __global__ void add_kernel_5(int *out, int *in) { //CHECK-NEXT: } //CHECK-NEXT:} -__global__ void add_kernel_6(int *a, int *b) { +__global__ void add_kernel_6(double *a, double *b) { int index = threadIdx.x + blockIdx.x * blockDim.x; a[2 * index] = b[0]; a[2 * index + 1] = b[0]; } +// CHECK: void add_kernel_6_grad(double *a, double *b, double *_d_a, double *_d_b) { +//CHECK-NEXT: unsigned int _t1 = blockIdx.x; +//CHECK-NEXT: unsigned int _t0 = blockDim.x; +//CHECK-NEXT: int _d_index = 0; +//CHECK-NEXT: int index0 = threadIdx.x + _t1 * _t0; +//CHECK-NEXT: double _t2 = a[2 * index0]; +//CHECK-NEXT: a[2 * index0] = b[0]; +//CHECK-NEXT: double _t3 = a[2 * index0 + 1]; +//CHECK-NEXT: a[2 * index0 + 1] = b[0]; +//CHECK-NEXT: { +//CHECK-NEXT: a[2 * index0 + 1] = _t3; +//CHECK-NEXT: double _r_d1 = _d_a[2 * index0 + 1]; +//CHECK-NEXT: _d_a[2 * index0 + 1] = 0.; +//CHECK-NEXT: atomicAdd(&_d_b[0], _r_d1); +//CHECK-NEXT: } +//CHECK-NEXT: { +//CHECK-NEXT: a[2 * index0] = _t2; +//CHECK-NEXT: double _r_d0 = _d_a[2 * index0]; +//CHECK-NEXT: _d_a[2 * index0] = 0.; +//CHECK-NEXT: atomicAdd(&_d_b[0], _r_d0); +//CHECK-NEXT: } +//CHECK-NEXT:} + #define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \ { \ int *fives = (int*)malloc(N * sizeof(int)); \ @@ -312,6 +337,76 @@ __global__ void add_kernel_6(int *a, int *b) { free(res); \ } +#define TEST_2_N(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \ + { \ + int *fives = (int*)malloc(N * sizeof(int)); \ + for(int i = 0; i < N; i++) { \ + fives[i] = 5; \ + } \ + int *zeros = (int*)malloc(N * sizeof(int)); \ + for(int i = 0; i < N; i++) { \ + zeros[i] = 0; \ + } \ + cudaMemcpy(x, fives, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(y, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(dy, fives, N * sizeof(int), cudaMemcpyHostToDevice); \ + cudaMemcpy(dx, zeros, N * sizeof(int), cudaMemcpyHostToDevice); \ + auto test = clad::gradient(F, args); \ + if constexpr (use_stream) { \ + cudaStream_t cudaStream; \ + cudaStreamCreate(&cudaStream); \ + test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, N, dy, dx); \ + } \ + else { \ + test.execute_kernel(grid, block, y, x, N, dy, dx); \ + } \ + cudaDeviceSynchronize(); \ + int *res = (int*)malloc(N * sizeof(int)); \ + cudaMemcpy(res, dx, N * sizeof(int), cudaMemcpyDeviceToHost); \ + for (int i = 0; i < (N - 1); i++) { \ + printf("%d, ", res[i]); \ + } \ + printf("%d\n", res[N-1]); \ + free(fives); \ + free(zeros); \ + free(res); \ + } + +#define TEST_2_D(F, grid, block, shared_mem, use_stream, args, y, x, dy, dx, N) \ + { \ + double *fives = (double*)malloc(N * sizeof(double)); \ + for(int i = 0; i < N; i++) { \ + fives[i] = 5; \ + } \ + double *zeros = (double*)malloc(N * sizeof(double)); \ + for(int i = 0; i < N; i++) { \ + zeros[i] = 0; \ + } \ + cudaMemcpy(x, fives, N * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(y, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(dy, fives, N * sizeof(double), cudaMemcpyHostToDevice); \ + cudaMemcpy(dx, zeros, N * sizeof(double), cudaMemcpyHostToDevice); \ + auto test = clad::gradient(F, args); \ + if constexpr (use_stream) { \ + cudaStream_t cudaStream; \ + cudaStreamCreate(&cudaStream); \ + test.execute_kernel(grid, block, shared_mem, cudaStream, y, x, dy, dx); \ + } \ + else { \ + test.execute_kernel(grid, block, y, x, dy, dx); \ + } \ + cudaDeviceSynchronize(); \ + double *res = (double*)malloc(N * sizeof(double)); \ + cudaMemcpy(res, dx, N * sizeof(double), cudaMemcpyDeviceToHost); \ + for (int i = 0; i < (N - 1); i++) { \ + printf("%0.2f, ", res[i]); \ + } \ + printf("%0.2f\n", res[N-1]); \ + free(fives); \ + free(zeros); \ + free(res); \ + } + int main(void) { int *a, *d_a; @@ -340,14 +435,27 @@ int main(void) { TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 - TEST_2(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 - TEST_2(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 - TEST_2(add_kernel_6, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out, dummy_in, d_out, d_in, 10); // CHECK-EXEC: 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + TEST_2_N(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 + TEST_2_N(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 cudaFree(dummy_in); cudaFree(dummy_out); cudaFree(d_out); cudaFree(d_in); + double *dummy_in_double, *dummy_out_double, *d_out_double, *d_in_double; + cudaMalloc(&dummy_in_double, 10 * sizeof(double)); + cudaMalloc(&dummy_out_double, 10 * sizeof(double)); + cudaMalloc(&d_out_double, 10 * sizeof(double)); + cudaMalloc(&d_in_double, 10 * sizeof(double)); + + TEST_2_D(add_kernel_6, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out_double, dummy_in_double, d_out_double, d_in_double, 10); // CHECK-EXEC: 50.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 + + cudaFree(dummy_in_double); + cudaFree(dummy_out_double); + cudaFree(d_out_double); + cudaFree(d_in_double); + + return 0; }