Skip to content

Commit

Permalink
Make add-assign op atomic and add test
Browse files Browse the repository at this point in the history
  • Loading branch information
kchristin22 authored and vgvassilev committed Sep 24, 2024
1 parent 2e5560e commit 02f87f3
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 10 deletions.
67 changes: 61 additions & 6 deletions lib/Differentiator/ReverseModeVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1485,9 +1485,38 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
BuildArraySubscript(target, forwSweepDerivativeIndices);
// Create the (target += dfdx) statement.
if (dfdx()) {
auto* add_assign = BuildOp(BO_AddAssign, result, dfdx());
// Add it to the body statements.
addToCurrentBlock(add_assign, direction::reverse);
if (m_DiffReq->hasAttr<clang::CUDAGlobalAttr>()) {
DeclarationName atomicAddId = &m_Context.Idents.get("atomicAdd");
LookupResult lookupResult(m_Sema, atomicAddId, SourceLocation(),
Sema::LookupOrdinaryName);
m_Sema.LookupQualifiedName(lookupResult,
m_Context.getTranslationUnitDecl());

FunctionDecl* atomicAddFunc = nullptr;
for (LookupResult::iterator it = lookupResult.begin();
it != lookupResult.end(); it++) {
NamedDecl* decl = *it;
// FIXME: check for underlying types of the pointers
if (dyn_cast<FunctionDecl>(decl)->getReturnType() ==
result->getType()) {
printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(),
result->getType().getAsString().c_str());
atomicAddFunc = dyn_cast<FunctionDecl>(decl);
break;
}
}
assert(atomicAddFunc && "atomicAdd function not found");
Expr* addrOfRes = BuildOp(UnaryOperatorKind::UO_AddrOf, result);
llvm::SmallVector<Expr*, 2> atomicArgs = {addrOfRes, dfdx()};
Expr* atomicCall = BuildCallExprToFunction(atomicAddFunc, atomicArgs);

// Add it to the body statements.
addToCurrentBlock(atomicCall, direction::reverse);
} else {
auto* add_assign = BuildOp(BO_AddAssign, result, dfdx());
// Add it to the body statements.
addToCurrentBlock(add_assign, direction::reverse);
}
}
if (m_ExternalSource)
m_ExternalSource->ActAfterProcessingArraySubscriptExpr(valueForRevSweep);
Expand Down Expand Up @@ -2279,9 +2308,35 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
derivedE = BuildOp(UnaryOperatorKind::UO_Deref, diff_dx);
// Create the (target += dfdx) statement.
if (dfdx()) {
auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx());
// Add it to the body statements.
addToCurrentBlock(add_assign, direction::reverse);
if (m_DiffReq->hasAttr<clang::CUDAGlobalAttr>()) {
DeclarationName atomicAddId = &m_Context.Idents.get("atomicAdd");
LookupResult lookupResult(m_Sema, atomicAddId, SourceLocation(),
Sema::LookupOrdinaryName);
m_Sema.LookupQualifiedName(lookupResult, m_Context.getTranslationUnitDecl());

FunctionDecl* atomicAddFunc = nullptr;
for (LookupResult::iterator it = lookupResult.begin();
it != lookupResult.end(); it++) {
NamedDecl* decl = *it;
// FIXME: check for underlying types of the pointers
if (dyn_cast<FunctionDecl>(decl)->getReturnType() == derivedE->getType()) {
printf("decl: %s, type: %s\n", decl->getNameAsString().c_str(), derivedE->getType().getAsString().c_str());
atomicAddFunc = dyn_cast<FunctionDecl>(decl);
break;
}
}
assert(atomicAddFunc && "atomicAdd function not found");
llvm::SmallVector<Expr*, 2> atomicArgs = {diff_dx, dfdx()};
Expr* atomicCall =
BuildCallExprToFunction(atomicAddFunc, atomicArgs);

// Add it to the body statements.
addToCurrentBlock(atomicCall, direction::reverse);
} else {
auto* add_assign = BuildOp(BO_AddAssign, derivedE, dfdx());
// Add it to the body statements.
addToCurrentBlock(add_assign, direction::reverse);
}
}
}
return {cloneE, derivedE, derivedE};
Expand Down
15 changes: 11 additions & 4 deletions test/CUDA/GradientKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,12 @@ __global__ void add_kernel_5(int *out, int *in) {
//CHECK-NEXT: }
//CHECK-NEXT:}

__global__ void add_kernel_6(int *a, int *b) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
a[2 * index] = b[0];
a[2 * index + 1] = b[0];
}

#define TEST(F, grid, block, shared_mem, use_stream, x, dx, N) \
{ \
int *fives = (int*)malloc(N * sizeof(int)); \
Expand Down Expand Up @@ -326,16 +332,17 @@ int main(void) {


int *dummy_in, *dummy_out, *d_out, *d_in;
cudaMalloc(&dummy_in, 5 * sizeof(int));
cudaMalloc(&dummy_out, 5 * sizeof(int));
cudaMalloc(&d_out, 5 * sizeof(int));
cudaMalloc(&d_in, 5 * sizeof(int));
cudaMalloc(&dummy_in, 10 * sizeof(int));
cudaMalloc(&dummy_out, 10 * sizeof(int));
cudaMalloc(&d_out, 10 * sizeof(int));
cudaMalloc(&d_in, 10 * sizeof(int));

TEST_2(add_kernel, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_2, dim3(1), dim3(5, 1, 1), 0, true, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_3, dim3(5, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_4, dim3(1), dim3(5, 1, 1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_5, dim3(2, 1, 1), dim3(1), 0, false, "in, out", dummy_out, dummy_in, d_out, d_in, 5); // CHECK-EXEC: 5, 5, 5, 5, 5
TEST_2(add_kernel_6, dim3(1), dim3(5, 1, 1), 0, false, "a, b", dummy_out, dummy_in, d_out, d_in, 10); // CHECK-EXEC: 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

cudaFree(dummy_in);
cudaFree(dummy_out);
Expand Down

0 comments on commit 02f87f3

Please sign in to comment.