Skip to content

Commit

Permalink
Use LookupQualifiedName and BuildDeclarationNameExpr for cudaMemcpyDe…
Browse files Browse the repository at this point in the history
…viceToHost expr
  • Loading branch information
kchristin22 committed Nov 6, 2024
1 parent e1809c0 commit 21ad43f
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 38 deletions.
101 changes: 66 additions & 35 deletions lib/Differentiator/ReverseModeVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1895,50 +1895,81 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
PreCallStmts.push_back(BuildDeclStmt(dArgDecl));
DeclRefExpr* dArgRef = BuildDeclRef(dArgDecl);
if (isa<CUDAKernelCallExpr>(CE)) {
// Create variables to be allocated on the device and passed to
// kernel. These need to be pointers because cudaMlloc expects a
// double pointer as an arg.
// Create variables to be allocated and initialized on the device, and
// then be passed to the kernel pullback.
//
// These need to be pointers because cudaMalloc expects a double
// pointer as an arg.
// The memory addresses they point to are initialized to zero through
// cudaMemset.
// After the pullback call, their values will be copied back to the
// corresponding _r variables on the host and the device variables
// will be freed.
//
// Example of the generated code:
//
// double _r0 = 0;
// double* _r1 = nullptr;
// cudaMalloc(&_r1, sizeof(double));
// cudaMemset(_r1, 0, 8);
// kernel_pullback<<<...>>>(..., _r1);
// cudaMemcpy(&_r0, _r1, 8, cudaMemcpyDeviceToHost);
// cudaFree(_r1);

// Create a literal for the size of the type
Expr* sizeLiteral = ConstantFolder::synthesizeLiteral(
m_Context.IntTy, m_Context, m_Context.getTypeSize(dArgTy));
m_Context.IntTy, m_Context, m_Context.getTypeSize(dArgTy) / 8);
dArgTy = m_Context.getPointerType(dArgTy);
VarDecl* dArgDeclCUDA =
BuildVarDecl(dArgTy, "_r", getZeroInit(dArgTy));

// Create the cudaMemcpyDeviceToHost argument
IdentifierInfo* idInfo = &m_Context.Idents.get("cudaMemcpyKind");
LookupResult result(m_Sema, idInfo, SourceLocation(),
Sema::LookupOrdinaryName);
m_Sema.LookupName(result, m_Sema.getCurScope());
EnumDecl* cudaMemcpyKindDecl = nullptr;
for (NamedDecl* decl : result)
if (auto* enumDecl = dyn_cast<EnumDecl>(decl)) {
cudaMemcpyKindDecl = enumDecl;
break;
}
if (!cudaMemcpyKindDecl) {
diag(DiagnosticsEngine::Error, CE->getEndLoc(),
"Failed to create cudaMemcpy call; cudaMemcpyKind not found");
}
QualType cudaMemcpyKindType =
m_Context.getTypeDeclType(cudaMemcpyKindDecl);
EnumConstantDecl* deviceToHostEnumDecl = nullptr;
for (EnumConstantDecl* enumConst :
cudaMemcpyKindDecl->enumerators()) {
if (enumConst->getName() == "cudaMemcpyDeviceToHost") {
deviceToHostEnumDecl = enumConst;
break;
}
}
if (!deviceToHostEnumDecl) {
IdentifierInfo* deviceToHostIdInfo =
&m_Context.Idents.get("cudaMemcpyDeviceToHost");
DeclarationName deviceToHostName(deviceToHostIdInfo);
LookupResult deviceToHostResult(m_Sema, deviceToHostName,
SourceLocation(),
Sema::LookupOrdinaryName);
m_Sema.LookupQualifiedName(deviceToHostResult,
m_Context.getTranslationUnitDecl());
if (deviceToHostResult.empty()) {
diag(DiagnosticsEngine::Error, CE->getEndLoc(),
"Failed to create cudaMemcpy call; cudaMemcpyDeviceToHost not "
"found");
"found. Creating kernel pullback aborted.");
for (std::size_t a = 0; a < CE->getNumArgs(); ++a)
CallArgs.push_back(
Clone(CE->getArg(a))); // create a non-const copy
Expr* call =
m_Sema
.ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()),
Loc, CallArgs, Loc, CUDAExecConfig)
.get();
return StmtDiff(call);
}
CXXScopeSpec SS;
Expr* deviceToHostExpr =
m_Sema
.BuildDeclarationNameExpr(SS, deviceToHostResult,
/*ADL=*/false)
.get();
if (!deviceToHostExpr) {
diag(
DiagnosticsEngine::Error, CE->getEndLoc(),
"Failed to create cudaMemcpy call; Failed to create expression "
"for cudaMemcpyDeviceToHost. Creating kernel pullback "
"aborted.");
for (std::size_t a = 0; a < CE->getNumArgs(); ++a)
CallArgs.push_back(
Clone(CE->getArg(a))); // create a non-const copy
Expr* call =
m_Sema
.ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()),
Loc, CallArgs, Loc, CUDAExecConfig)
.get();
return StmtDiff(call);
}
auto* deviceToHostDeclRef = clad_compat::GetResult<Expr*>(
m_Sema.BuildDeclRefExpr(deviceToHostEnumDecl, cudaMemcpyKindType,
CLAD_COMPAT_ExprValueKind_R_or_PR_Value,
SourceLocation(), nullptr));

// Add calls to cudaMalloc, cudaMemset, cudaMemcpy, and cudaFree
PreCallStmts.push_back(BuildDeclStmt(dArgDeclCUDA));
Expr* refOp = BuildOp(UO_AddrOf, BuildDeclRef(dArgDeclCUDA));
llvm::SmallVector<Expr*, 3> mallocArgs = {refOp, sizeLiteral};
Expand All @@ -1949,7 +1980,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
PreCallStmts.push_back(GetFunctionCall("cudaMemset", "", memsetArgs));
llvm::SmallVector<Expr*, 4> cudaMemcpyArgs = {
BuildOp(UO_AddrOf, dArgRef), BuildDeclRef(dArgDeclCUDA),
sizeLiteral, deviceToHostDeclRef};
sizeLiteral, deviceToHostExpr};
PostCallStmts.push_back(
GetFunctionCall("cudaMemcpy", "", cudaMemcpyArgs));
llvm::SmallVector<Expr*, 3> freeArgs = {BuildDeclRef(dArgDeclCUDA)};
Expand Down
6 changes: 3 additions & 3 deletions test/CUDA/GradientKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -541,10 +541,10 @@ void launch_add_kernel_4(int *out, int *in, const int N) {
//CHECK-NEXT: {
//CHECK-NEXT: int _r4 = 0;
//CHECK-NEXT: int *_r5 = nullptr;
//CHECK-NEXT: cudaMalloc(&_r5, 32);
//CHECK-NEXT: cudaMemset(_r5, 0, 32);
//CHECK-NEXT: cudaMalloc(&_r5, 4);
//CHECK-NEXT: cudaMemset(_r5, 0, 4);
//CHECK-NEXT: add_kernel_4_pullback<<<1, 5>>>(out_dev, in_dev, N, _d_out_dev, _d_in_dev, _r5);
//CHECK-NEXT: cudaMemcpy(&_r4, _r5, 32, cudaMemcpyDeviceToHost);
//CHECK-NEXT: cudaMemcpy(&_r4, _r5, 4, cudaMemcpyDeviceToHost);
//CHECK-NEXT: cudaFree(_r5);
//CHECK-NEXT: }
//CHECK-NEXT: {
Expand Down

0 comments on commit 21ad43f

Please sign in to comment.