Use LookupQualifiedName and BuildDeclarationNameExpr for cudaMemcpyDe…

…viceToHost expr
kchristin22 · Nov 6, 2024 · 21ad43f · 21ad43f
1 parent e1809c0
commit 21ad43f
Show file tree

Hide file tree

Showing 2 changed files with 69 additions and 38 deletions.
diff --git a/lib/Differentiator/ReverseModeVisitor.cpp b/lib/Differentiator/ReverseModeVisitor.cpp
@@ -1895,50 +1895,81 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
         PreCallStmts.push_back(BuildDeclStmt(dArgDecl));
         DeclRefExpr* dArgRef = BuildDeclRef(dArgDecl);
         if (isa<CUDAKernelCallExpr>(CE)) {
-          // Create variables to be allocated on the device and passed to
-          // kernel. These need to be pointers because cudaMlloc expects a
-          // double pointer as an arg.
+          // Create variables to be allocated and initialized on the device, and
+          // then be passed to the kernel pullback.
+          //
+          // These need to be pointers because cudaMalloc expects a double
+          // pointer as an arg.
+          // The memory addresses they point to are initialized to zero through
+          // cudaMemset.
+          // After the pullback call, their values will be copied back to the
+          // corresponding _r variables on the host and the device variables
+          // will be freed.
+          //
+          // Example of the generated code:
+          //
+          // double _r0 = 0;
+          // double* _r1 = nullptr;
+          // cudaMalloc(&_r1, sizeof(double));
+          // cudaMemset(_r1, 0, 8);
+          // kernel_pullback<<<...>>>(..., _r1);
+          // cudaMemcpy(&_r0, _r1, 8, cudaMemcpyDeviceToHost);
+          // cudaFree(_r1);
+
+          // Create a literal for the size of the type
           Expr* sizeLiteral = ConstantFolder::synthesizeLiteral(
-              m_Context.IntTy, m_Context, m_Context.getTypeSize(dArgTy));
+              m_Context.IntTy, m_Context, m_Context.getTypeSize(dArgTy) / 8);
           dArgTy = m_Context.getPointerType(dArgTy);
           VarDecl* dArgDeclCUDA =
               BuildVarDecl(dArgTy, "_r", getZeroInit(dArgTy));
 
           // Create the cudaMemcpyDeviceToHost argument
-          IdentifierInfo* idInfo = &m_Context.Idents.get("cudaMemcpyKind");
-          LookupResult result(m_Sema, idInfo, SourceLocation(),
-                              Sema::LookupOrdinaryName);
-          m_Sema.LookupName(result, m_Sema.getCurScope());
-          EnumDecl* cudaMemcpyKindDecl = nullptr;
-          for (NamedDecl* decl : result)
-            if (auto* enumDecl = dyn_cast<EnumDecl>(decl)) {
-              cudaMemcpyKindDecl = enumDecl;
-              break;
-            }
-          if (!cudaMemcpyKindDecl) {
-            diag(DiagnosticsEngine::Error, CE->getEndLoc(),
-                 "Failed to create cudaMemcpy call; cudaMemcpyKind not found");
-          }
-          QualType cudaMemcpyKindType =
-              m_Context.getTypeDeclType(cudaMemcpyKindDecl);
-          EnumConstantDecl* deviceToHostEnumDecl = nullptr;
-          for (EnumConstantDecl* enumConst :
-               cudaMemcpyKindDecl->enumerators()) {
-            if (enumConst->getName() == "cudaMemcpyDeviceToHost") {
-              deviceToHostEnumDecl = enumConst;
-              break;
-            }
-          }
-          if (!deviceToHostEnumDecl) {
+          IdentifierInfo* deviceToHostIdInfo =
+              &m_Context.Idents.get("cudaMemcpyDeviceToHost");
+          DeclarationName deviceToHostName(deviceToHostIdInfo);
+          LookupResult deviceToHostResult(m_Sema, deviceToHostName,
+                                          SourceLocation(),
+                                          Sema::LookupOrdinaryName);
+          m_Sema.LookupQualifiedName(deviceToHostResult,
+                                     m_Context.getTranslationUnitDecl());
+          if (deviceToHostResult.empty()) {
             diag(DiagnosticsEngine::Error, CE->getEndLoc(),
                  "Failed to create cudaMemcpy call; cudaMemcpyDeviceToHost not "
-                 "found");
+                 "found. Creating kernel pullback aborted.");
+            for (std::size_t a = 0; a < CE->getNumArgs(); ++a)
+              CallArgs.push_back(
+                  Clone(CE->getArg(a))); // create a non-const copy
+            Expr* call =
+                m_Sema
+                    .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()),
+                                   Loc, CallArgs, Loc, CUDAExecConfig)
+                    .get();
+            return StmtDiff(call);
+          }
+          CXXScopeSpec SS;
+          Expr* deviceToHostExpr =
+              m_Sema
+                  .BuildDeclarationNameExpr(SS, deviceToHostResult,
+                                            /*ADL=*/false)
+                  .get();
+          if (!deviceToHostExpr) {
+            diag(
+                DiagnosticsEngine::Error, CE->getEndLoc(),
+                "Failed to create cudaMemcpy call; Failed to create expression "
+                "for cudaMemcpyDeviceToHost. Creating kernel pullback "
+                "aborted.");
+            for (std::size_t a = 0; a < CE->getNumArgs(); ++a)
+              CallArgs.push_back(
+                  Clone(CE->getArg(a))); // create a non-const copy
+            Expr* call =
+                m_Sema
+                    .ActOnCallExpr(getCurrentScope(), Clone(CE->getCallee()),
+                                   Loc, CallArgs, Loc, CUDAExecConfig)
+                    .get();
+            return StmtDiff(call);
           }
-          auto* deviceToHostDeclRef = clad_compat::GetResult<Expr*>(
-              m_Sema.BuildDeclRefExpr(deviceToHostEnumDecl, cudaMemcpyKindType,
-                                      CLAD_COMPAT_ExprValueKind_R_or_PR_Value,
-                                      SourceLocation(), nullptr));
 
+          // Add calls to cudaMalloc, cudaMemset, cudaMemcpy, and cudaFree
           PreCallStmts.push_back(BuildDeclStmt(dArgDeclCUDA));
           Expr* refOp = BuildOp(UO_AddrOf, BuildDeclRef(dArgDeclCUDA));
           llvm::SmallVector<Expr*, 3> mallocArgs = {refOp, sizeLiteral};
@@ -1949,7 +1980,7 @@ Expr* getArraySizeExpr(const ArrayType* AT, ASTContext& context,
           PreCallStmts.push_back(GetFunctionCall("cudaMemset", "", memsetArgs));
           llvm::SmallVector<Expr*, 4> cudaMemcpyArgs = {
               BuildOp(UO_AddrOf, dArgRef), BuildDeclRef(dArgDeclCUDA),
-              sizeLiteral, deviceToHostDeclRef};
+              sizeLiteral, deviceToHostExpr};
           PostCallStmts.push_back(
               GetFunctionCall("cudaMemcpy", "", cudaMemcpyArgs));
           llvm::SmallVector<Expr*, 3> freeArgs = {BuildDeclRef(dArgDeclCUDA)};

diff --git a/test/CUDA/GradientKernels.cu b/test/CUDA/GradientKernels.cu
@@ -541,10 +541,10 @@ void launch_add_kernel_4(int *out, int *in, const int N) {
 //CHECK-NEXT:    {
 //CHECK-NEXT:        int _r4 = 0;
 //CHECK-NEXT:        int *_r5 = nullptr;
-//CHECK-NEXT:        cudaMalloc(&_r5, 32);
-//CHECK-NEXT:        cudaMemset(_r5, 0, 32);
+//CHECK-NEXT:        cudaMalloc(&_r5, 4);
+//CHECK-NEXT:        cudaMemset(_r5, 0, 4);
 //CHECK-NEXT:        add_kernel_4_pullback<<<1, 5>>>(out_dev, in_dev, N, _d_out_dev, _d_in_dev, _r5);
-//CHECK-NEXT:        cudaMemcpy(&_r4, _r5, 32, cudaMemcpyDeviceToHost);
+//CHECK-NEXT:        cudaMemcpy(&_r4, _r5, 4, cudaMemcpyDeviceToHost);
 //CHECK-NEXT:        cudaFree(_r5);
 //CHECK-NEXT:    }
 //CHECK-NEXT:    {