diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp index 4d600caf7..edfa323bf 100644 --- a/lib/Differentiator/ReverseModeVisitor.cpp +++ b/lib/Differentiator/ReverseModeVisitor.cpp @@ -1485,9 +1485,38 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, BuildArraySubscript(target, forwSweepDerivativeIndices); // Create the (target += dfdx) statement. if (dfdx()) { - auto* add_assign = BuildOp(BO_AddAssign, result, dfdx()); - // Add it to the body statements. - addToCurrentBlock(add_assign, direction::reverse); + if (m_DiffReq->hasAttr()) { + DeclarationName atomicAddId = &m_Context.Idents.get("atomicAdd"); + LookupResult lookupResult(m_Sema, atomicAddId, SourceLocation(), + Sema::LookupOrdinaryName); + m_Sema.LookupQualifiedName(lookupResult, + m_Context.getTranslationUnitDecl()); + + FunctionDecl* atomicAddFunc = nullptr; + for (LookupResult::iterator it = lookupResult.begin(); + it != lookupResult.end(); it++) { + NamedDecl* decl = *it; + // FIXME: check for underlying types of the pointers + if (dyn_cast(decl)->getReturnType() == + result->getType()) { + printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(), + result->getType().getAsString().c_str()); + atomicAddFunc = dyn_cast(decl); + break; + } + } + assert(atomicAddFunc && "atomicAdd function not found"); + Expr* addrOfRes = BuildOp(UnaryOperatorKind::UO_AddrOf, result); + llvm::SmallVector atomicArgs = {addrOfRes, dfdx()}; + Expr* atomicCall = BuildCallExprToFunction(atomicAddFunc, atomicArgs); + + // Add it to the body statements. + addToCurrentBlock(atomicCall, direction::reverse); + } else { + auto* add_assign = BuildOp(BO_AddAssign, result, dfdx()); + // Add it to the body statements. + addToCurrentBlock(add_assign, direction::reverse); + } } if (m_ExternalSource) m_ExternalSource->ActAfterProcessingArraySubscriptExpr(valueForRevSweep); @@ -2279,9 +2308,35 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context, derivedE = BuildOp(UnaryOperatorKind::UO_Deref, diff_dx); // Create the (target += dfdx) statement. if (dfdx()) { - auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx()); - // Add it to the body statements. - addToCurrentBlock(add_assign, direction::reverse); + if (m_DiffReq->hasAttr()) { + DeclarationName atomicAddId = &m_Context.Idents.get("atomicAdd"); + LookupResult lookupResult(m_Sema, atomicAddId, SourceLocation(), + Sema::LookupOrdinaryName); + m_Sema.LookupQualifiedName(lookupResult, m_Context.getTranslationUnitDecl()); + + FunctionDecl* atomicAddFunc = nullptr; + for (LookupResult::iterator it = lookupResult.begin(); + it != lookupResult.end(); it++) { + NamedDecl* decl = *it; + // FIXME: check for underlying types of the pointers + if (dyn_cast(decl)->getReturnType() == derivedE->getType()) { + printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(), derivedE->getType().getAsString().c_str()); + atomicAddFunc = dyn_cast(decl); + break; + } + } + assert(atomicAddFunc && "atomicAdd function not found"); + llvm::SmallVector atomicArgs = {diff_dx, dfdx()}; + Expr* atomicCall = + BuildCallExprToFunction(atomicAddFunc, atomicArgs); + + // Add it to the body statements. + addToCurrentBlock(atomicCall, direction::reverse); + } else { + auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx()); + // Add it to the body statements. + addToCurrentBlock(add_assign, direction::reverse); + } } } return {cloneE, derivedE, derivedE}; diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu index 01da8a299..725236992 100644 --- a/test/CUDA/GradientKernels.cu +++ b/test/CUDA/GradientKernels.cu @@ -237,6 +237,12 @@ __global__ void add_kernel_5(int *out, int *in) { //CHECK-NEXT: } //CHECK-NEXT:} +__global__ void add_kernel_6(int *a, int *b) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + a[2 * index] = b[0]; + a[2 * index + 1] = b[0]; +} + #define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \ { \ int *fives = (int*)malloc(N * sizeof(int)); \ @@ -326,16 +332,17 @@ int main(void) { int *dummy_in, *dummy_out, *d_out, *d_in; - cudaMalloc(&dummy_in, 5 * sizeof(int)); - cudaMalloc(&dummy_out, 5 * sizeof(int)); - cudaMalloc(&d_out, 5 * sizeof(int)); - cudaMalloc(&d_in, 5 * sizeof(int)); + cudaMalloc(&dummy_in, 10 * sizeof(int)); + cudaMalloc(&dummy_out, 10 * sizeof(int)); + cudaMalloc(&d_out, 10 * sizeof(int)); + cudaMalloc(&d_in, 10 * sizeof(int)); TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 TEST_2(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 TEST_2(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5 + TEST_2(add_kernel_6, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out, dummy_in, d_out, d_in, 10); // CHECK-EXEC: 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 cudaFree(dummy_in); cudaFree(dummy_out);