diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp
index 4d600caf7..edfa323bf 100644
--- a/lib/Differentiator/ReverseModeVisitor.cpp
+++ b/lib/Differentiator/ReverseModeVisitor.cpp
@@ -1485,9 +1485,38 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
         BuildArraySubscript(target, forwSweepDerivativeIndices);
     // Create the (target += dfdx) statement.
     if (dfdx()) {
-      auto* add_assign = BuildOp(BO_AddAssign, result, dfdx());
-      // Add it to the body statements.
-      addToCurrentBlock(add_assign, direction::reverse);
+      if (m_DiffReq->hasAttr<clang::CUDAGlobalAttr>()) {
+        DeclarationName atomicAddId = &m_Context.Idents.get("atomicAdd");
+        LookupResult lookupResult(m_Sema, atomicAddId, SourceLocation(),
+                                  Sema::LookupOrdinaryName);
+        m_Sema.LookupQualifiedName(lookupResult,
+                                   m_Context.getTranslationUnitDecl());
+
+        FunctionDecl* atomicAddFunc = nullptr;
+        for (LookupResult::iterator it = lookupResult.begin();
+             it != lookupResult.end(); it++) {
+          NamedDecl* decl = *it;
+          // FIXME: check for underlying types of the pointers
+          if (dyn_cast<FunctionDecl>(decl)->getReturnType() ==
+              result->getType()) {
+            printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(),
+                   result->getType().getAsString().c_str());
+            atomicAddFunc = dyn_cast<FunctionDecl>(decl);
+            break;
+          }
+        }
+        assert(atomicAddFunc && "atomicAdd function not found");
+        Expr* addrOfRes = BuildOp(UnaryOperatorKind::UO_AddrOf, result);
+        llvm::SmallVector<Expr*, 2> atomicArgs = {addrOfRes, dfdx()};
+        Expr* atomicCall = BuildCallExprToFunction(atomicAddFunc, atomicArgs);
+
+        // Add it to the body statements.
+        addToCurrentBlock(atomicCall, direction::reverse);
+      } else {
+        auto* add_assign = BuildOp(BO_AddAssign, result, dfdx());
+        // Add it to the body statements.
+        addToCurrentBlock(add_assign, direction::reverse);
+      }
     }
     if (m_ExternalSource)
       m_ExternalSource->ActAfterProcessingArraySubscriptExpr(valueForRevSweep);
@@ -2279,9 +2308,35 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
         derivedE = BuildOp(UnaryOperatorKind::UO_Deref, diff_dx);
         // Create the (target += dfdx) statement.
         if (dfdx()) {
-          auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx());
-          // Add it to the body statements.
-          addToCurrentBlock(add_assign, direction::reverse);
+          if (m_DiffReq->hasAttr<clang::CUDAGlobalAttr>()) {
+            DeclarationName atomicAddId = &m_Context.Idents.get("atomicAdd");
+            LookupResult lookupResult(m_Sema, atomicAddId, SourceLocation(),
+                                      Sema::LookupOrdinaryName);
+            m_Sema.LookupQualifiedName(lookupResult, m_Context.getTranslationUnitDecl());
+
+            FunctionDecl* atomicAddFunc = nullptr;
+            for (LookupResult::iterator it = lookupResult.begin();
+                it != lookupResult.end(); it++) {
+              NamedDecl* decl = *it;
+              // FIXME: check for underlying types of the pointers
+              if (dyn_cast<FunctionDecl>(decl)->getReturnType() == derivedE->getType()) {
+                printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(), derivedE->getType().getAsString().c_str());
+                atomicAddFunc = dyn_cast<FunctionDecl>(decl);
+                break;
+              }
+            }
+            assert(atomicAddFunc && "atomicAdd function not found");
+            llvm::SmallVector<Expr*, 2> atomicArgs  = {diff_dx, dfdx()};
+            Expr* atomicCall =
+                BuildCallExprToFunction(atomicAddFunc, atomicArgs);
+
+            // Add it to the body statements.
+            addToCurrentBlock(atomicCall, direction::reverse);
+          } else {
+            auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx());
+            // Add it to the body statements.
+            addToCurrentBlock(add_assign, direction::reverse);
+          }
         }
       }
       return {cloneE, derivedE, derivedE};
diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
index 01da8a299..725236992 100644
--- a/test/CUDA/GradientKernels.cu
+++ b/test/CUDA/GradientKernels.cu
@@ -237,6 +237,12 @@ __global__ void add_kernel_5(int *out, int *in) {
 //CHECK-NEXT:    }
 //CHECK-NEXT:}
 
+__global__ void add_kernel_6(int *a, int *b) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  a[2 * index] = b[0];
+  a[2 * index + 1] = b[0];
+}
+
 #define TEST(F, grid, block, shared_mem, use_stream, x, dx, N)              \
   {                                                                         \
     int *fives = (int*)malloc(N * sizeof(int));                             \
@@ -326,16 +332,17 @@ int main(void) {
 
 
   int *dummy_in, *dummy_out, *d_out, *d_in;
-  cudaMalloc(&dummy_in, 5 * sizeof(int));
-  cudaMalloc(&dummy_out, 5 * sizeof(int));
-  cudaMalloc(&d_out, 5 * sizeof(int));
-  cudaMalloc(&d_in, 5 * sizeof(int));
+  cudaMalloc(&dummy_in, 10 * sizeof(int));
+  cudaMalloc(&dummy_out, 10 * sizeof(int));
+  cudaMalloc(&d_out, 10 * sizeof(int));
+  cudaMalloc(&d_in, 10 * sizeof(int));
 
   TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
   TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
   TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
   TEST_2(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
   TEST_2(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
+  TEST_2(add_kernel_6, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out, dummy_in, d_out, d_in, 10); // CHECK-EXEC: 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 
   cudaFree(dummy_in);
   cudaFree(dummy_out);