Skip to content

Commit

Permalink
Merge pull request #294 from DrTimothyAldenDavis/dev2
Browse files Browse the repository at this point in the history
9.2.0
  • Loading branch information
DrTimothyAldenDavis authored May 29, 2024
2 parents 6680f9f + 9f2c7d1 commit 9bdf19d
Show file tree
Hide file tree
Showing 97 changed files with 15,326 additions and 14,046 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ endif ( )

# CUDA is under development for now, and not deployed in production:
set ( GRAPHBLAS_USE_CUDA OFF )
# set ( GRAPHBLAS_USE_CUDA ON )
# set ( GRAPHBLAS_USE_CUDA ON ) # FIXME: use this for CUDA development

include ( SuiteSparsePolicy )

Expand Down
61 changes: 61 additions & 0 deletions CUDA/GB_cuda_apply.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
//------------------------------------------------------------------------------
// GB_cuda_apply.hpp: CPU definitions for CUDA apply operations
//------------------------------------------------------------------------------

// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// FIX: the include guard was GB_CUDA_EWISE_H, copied from the ewise header;
// that collides with the ewise header's guard and could silently drop one of
// the two headers' declarations.  Use a guard unique to this file.
#ifndef GB_CUDA_APPLY_H
#define GB_CUDA_APPLY_H

#include "GB_cuda.hpp"

// apply a unary (or index-unary) operator to A, writing the result into Cx;
// ythunk is the scalar for index-unary ops (may be NULL otherwise — see
// GB_cuda_apply_unop, which only copies it when op->ytype is non-NULL)
GrB_Info GB_cuda_apply_unop_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,       // type of Cx
    const GB_Operator op,       // operator to apply
    const bool flipij,          // if true, swap the i,j indices
    const GrB_Matrix A,         // input matrix
    const GB_void *ythunk,      // scalar input for index-unary ops
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
) ;

// apply a binary operator with its first ('x') input bound to a scalar
// (parameter renamed from xscalar to scalarx to match the definition in
// GB_cuda_apply_bind1st_jit.cpp)
GrB_Info GB_cuda_apply_bind1st_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,       // type of Cx
    const GrB_BinaryOp op,      // binary operator to apply
    const GrB_Matrix A,         // input matrix (second operand)
    const GB_void *scalarx,     // scalar bound to the first operand
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
) ;

// apply a binary operator with its second ('y') input bound to a scalar
// (parameter renamed from xscalar to scalarx to match the definition in
// GB_cuda_apply_bind2nd_jit.cpp)
GrB_Info GB_cuda_apply_bind2nd_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,       // type of Cx
    const GrB_BinaryOp op,      // binary operator to apply
    const GrB_Matrix A,         // input matrix (first operand)
    const GB_void *scalarx,     // scalar bound to the second operand
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
) ;

#endif

53 changes: 53 additions & 0 deletions CUDA/GB_cuda_apply_bind1st_jit.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "GB_cuda_apply.hpp"

extern "C"
{
typedef GB_JIT_CUDA_KERNEL_APPLY_BIND1ST_PROTO ((*GB_jit_dl_function)) ;
}


//------------------------------------------------------------------------------
// GB_cuda_apply_bind1st_jit: JIT kernel dispatch for apply-bind1st on the GPU
//------------------------------------------------------------------------------

// Encodes the problem, loads (or compiles) the matching CUDA JIT kernel, and
// launches it on the given stream.  Per the bind1st naming, the scalar is
// presumably bound to the operator's first input — confirm in the kernel.

GrB_Info GB_cuda_apply_bind1st_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,
    const GrB_BinaryOp op,
    const GrB_Matrix A,
    const GB_void *scalarx,
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
)
{

    //--------------------------------------------------------------------------
    // encodify the problem
    //--------------------------------------------------------------------------

    GB_jit_encoding e ;
    char *op_suffix ;
    uint64_t code_hash = GB_encodify_ewise (&e, &op_suffix,
        GB_JIT_CUDA_KERNEL_APPLYBIND1, false, false, false, GxB_FULL, ctype,
        NULL, false, false, op, false, NULL, A) ;

    //--------------------------------------------------------------------------
    // get the kernel function pointer, loading or compiling it if needed
    //--------------------------------------------------------------------------

    void *dl = NULL ;
    GrB_Info status = GB_jitifyer_load (&dl,
        GB_jit_ewise_family, "cuda_apply_bind1st",
        code_hash, &e, op_suffix, NULL, NULL,
        (GB_Operator) op, ctype, NULL, A->type) ;
    if (status != GrB_SUCCESS)
    {
        return (status) ;
    }

    //--------------------------------------------------------------------------
    // call the jit kernel and return result
    //--------------------------------------------------------------------------

    GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl ;
    return (GB_jit_kernel (Cx, scalarx, A, stream, gridsz, blocksz)) ;
}
53 changes: 53 additions & 0 deletions CUDA/GB_cuda_apply_bind2nd_jit.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "GB_cuda_apply.hpp"

extern "C"
{
typedef GB_JIT_CUDA_KERNEL_APPLY_BIND2ND_PROTO ((*GB_jit_dl_function)) ;
}


//------------------------------------------------------------------------------
// GB_cuda_apply_bind2nd_jit: JIT kernel dispatch for apply-bind2nd on the GPU
//------------------------------------------------------------------------------

// Encodes the problem, loads (or compiles) the matching CUDA JIT kernel, and
// launches it on the given stream.  Per the bind2nd naming, the scalar is
// presumably bound to the operator's second input — confirm in the kernel.

GrB_Info GB_cuda_apply_bind2nd_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,
    const GrB_BinaryOp op,
    const GrB_Matrix A,
    const GB_void *scalarx,
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
)
{

    //--------------------------------------------------------------------------
    // encodify the problem
    //--------------------------------------------------------------------------

    GB_jit_encoding e ;
    char *op_suffix ;
    uint64_t code_hash = GB_encodify_ewise (&e, &op_suffix,
        GB_JIT_CUDA_KERNEL_APPLYBIND2, false, false, false, GxB_FULL, ctype,
        NULL, false, false, op, false, A, NULL) ;

    //--------------------------------------------------------------------------
    // get the kernel function pointer, loading or compiling it if needed
    //--------------------------------------------------------------------------

    void *dl = NULL ;
    GrB_Info status = GB_jitifyer_load (&dl,
        GB_jit_ewise_family, "cuda_apply_bind2nd",
        code_hash, &e, op_suffix, NULL, NULL,
        (GB_Operator) op, ctype, A->type, NULL) ;
    if (status != GrB_SUCCESS)
    {
        return (status) ;
    }

    //--------------------------------------------------------------------------
    // call the jit kernel and return result
    //--------------------------------------------------------------------------

    GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl ;
    return (GB_jit_kernel (Cx, A, scalarx, stream, gridsz, blocksz)) ;
}
69 changes: 69 additions & 0 deletions CUDA/GB_cuda_apply_binop.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include "GB_cuda_apply.hpp"

#undef GB_FREE_WORKSPACE
#define GB_FREE_WORKSPACE                               \
{                                                       \
    GB_FREE_WORK (&scalarx_cuda, scalarx_cuda_size) ;   \
}

#undef GB_FREE_ALL
// FIX: GB_FREE_ALL was an empty statement, which leaked scalarx_cuda on the
// early-return error paths taken via GB_OK / CUDA_OK (assuming those macros
// invoke GB_FREE_ALL on failure, per GraphBLAS convention — TODO confirm
// against their definitions).  There is no other output to free here, so
// GB_FREE_ALL is simply the workspace free.
#define GB_FREE_ALL GB_FREE_WORKSPACE

#define BLOCK_SIZE 512
#define LOG2_BLOCK_SIZE 9

//------------------------------------------------------------------------------
// GB_cuda_apply_binop: apply a binary op with one input bound to a scalar
//------------------------------------------------------------------------------

// Copies the scalar into GPU-accessible workspace, creates a stream, and
// dispatches to the bind1st or bind2nd CUDA JIT kernel.

GrB_Info GB_cuda_apply_binop
(
    GB_void *Cx,                // output array
    const GrB_Type ctype,       // type of Cx
    const GrB_BinaryOp op,      // binary operator to apply
    const GrB_Matrix A,         // input matrix
    const GB_void *scalarx,     // scalar to bind (x if bind1st, else y)
    const bool bind1st          // true: op(scalar,A); false: op(A,scalar)
)
{
    ASSERT (scalarx != NULL) ;

    // make a copy of scalarx to ensure it's not on the CPU stack, which would
    // not be accessible to the CUDA kernel
    GB_void *scalarx_cuda = NULL ;
    size_t scalarx_cuda_size = 0 ;

    // exact size of the scalar: the op's x operand for bind1st, else its y
    size_t scalar_size = bind1st ? op->xtype->size : op->ytype->size ;
    scalarx_cuda = GB_MALLOC_WORK (scalar_size, GB_void, &scalarx_cuda_size) ;
    if (scalarx_cuda == NULL)
    {
        return (GrB_OUT_OF_MEMORY) ;
    }

    // FIX: copy only scalar_size bytes.  The original copied
    // scalarx_cuda_size bytes, but that is the *allocated* size reported by
    // GB_MALLOC_WORK, which may be rounded up beyond the scalar's true size —
    // reading past the end of the caller's scalarx buffer.  (Compare
    // GB_cuda_apply_unop, which correctly copies op->ytype->size.)
    memcpy (scalarx_cuda, scalarx, scalar_size) ;

    // FIXME: use the stream pool
    cudaStream_t stream ;
    CUDA_OK (cudaStreamCreate (&stream)) ;

    GrB_Index anz = GB_nnz_held (A) ;

    // one thread per entry, BLOCK_SIZE threads per block
    int32_t gridsz = GB_ICEIL (anz, BLOCK_SIZE) ;

    GrB_Info info ;
    if (bind1st)
    {
        info = GB_cuda_apply_bind1st_jit (Cx, ctype, op, A,
            scalarx_cuda, stream, gridsz, BLOCK_SIZE) ;
    }
    else
    {
        info = GB_cuda_apply_bind2nd_jit (Cx, ctype, op, A,
            scalarx_cuda, stream, gridsz, BLOCK_SIZE) ;
    }

    // GrB_NO_VALUE from the JIT means the kernel could not be found/compiled;
    // treat that as a panic here
    if (info == GrB_NO_VALUE) info = GrB_PANIC ;
    // NOTE(review): the stream is not destroyed if GB_OK returns early —
    // acceptable until the FIXME above moves this to the stream pool
    GB_OK (info) ;

    CUDA_OK (cudaStreamSynchronize (stream)) ;
    CUDA_OK (cudaStreamDestroy (stream)) ;

    GB_FREE_WORKSPACE ;
    return GrB_SUCCESS ;
}
30 changes: 30 additions & 0 deletions CUDA/GB_cuda_apply_binop_branch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include "GraphBLAS_cuda.hpp"
#include "GB_cuda.hpp"

//------------------------------------------------------------------------------
// GB_cuda_apply_binop_branch: decide if the CUDA apply-binop kernel can be used
//------------------------------------------------------------------------------

// Returns true only if all types involved are supported on the GPU and the
// operator has a valid JIT hash.

bool GB_cuda_apply_binop_branch
(
    const GrB_Type ctype,
    const GrB_BinaryOp op,
    const GrB_Matrix A
)
{
    // FIX: check op for NULL *before* dereferencing it.  The original read
    // op->xtype / op->ytype / op->ztype first and only tested op != NULL at
    // the end, so a NULL op would crash before the check was reached.
    // (op->hash == UINT64_MAX appears to flag operators with no JIT kernel —
    // TODO confirm against the jitifyer.)
    if (op == NULL || op->hash == UINT64_MAX)
    {
        return (false) ;
    }

    bool ok = GB_cuda_type_branch (ctype) && GB_cuda_type_branch (A->type) ;

    if (op->xtype != NULL)
    {
        ok = ok && GB_cuda_type_branch (op->xtype) ;
    }
    if (op->ytype != NULL)
    {
        ok = ok && GB_cuda_type_branch (op->ytype) ;
    }
    if (op->ztype != NULL)
    {
        ok = ok && GB_cuda_type_branch (op->ztype) ;
    }

    return (ok) ;
}

60 changes: 60 additions & 0 deletions CUDA/GB_cuda_apply_unop.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include "GB_cuda_apply.hpp"

#undef GB_FREE_WORKSPACE
#define GB_FREE_WORKSPACE                               \
{                                                       \
    GB_FREE_WORK (&ythunk_cuda, ythunk_cuda_size) ;     \
}

#undef GB_FREE_ALL
// FIX: GB_FREE_ALL was an empty statement, which leaked ythunk_cuda on the
// early-return error paths taken via GB_OK / CUDA_OK (assuming those macros
// invoke GB_FREE_ALL on failure, per GraphBLAS convention — TODO confirm
// against their definitions).  There is no other output to free here, so
// GB_FREE_ALL is simply the workspace free.
#define GB_FREE_ALL GB_FREE_WORKSPACE

#define BLOCK_SIZE 512
#define LOG2_BLOCK_SIZE 9

//------------------------------------------------------------------------------
// GB_cuda_apply_unop: apply a unary or index-unary op via the CUDA JIT
//------------------------------------------------------------------------------

// Copies the ythunk scalar (if any) into GPU-accessible workspace, creates a
// stream, and dispatches to the apply-unop CUDA JIT kernel.

GrB_Info GB_cuda_apply_unop
(
    GB_void *Cx,                // output array
    const GrB_Type ctype,       // type of Cx
    const GB_Operator op,       // operator to apply
    const bool flipij,          // if true, swap the i,j indices
    const GrB_Matrix A,         // input matrix
    const GB_void *ythunk       // scalar for index-unary ops (may be NULL)
)
{

    GB_void *ythunk_cuda = NULL ;
    size_t ythunk_cuda_size = 0 ;
    if (ythunk != NULL && op != NULL && op->ytype != NULL)
    {
        // make a copy of ythunk, since ythunk might be allocated on
        // the CPU stack and thus not accessible to the CUDA kernel.
        ythunk_cuda = GB_MALLOC_WORK (op->ytype->size, GB_void,
            &ythunk_cuda_size) ;
        if (ythunk_cuda == NULL)
        {
            return (GrB_OUT_OF_MEMORY) ;
        }
        memcpy (ythunk_cuda, ythunk, op->ytype->size) ;
    }

    // FIXME: use the stream pool
    cudaStream_t stream ;
    CUDA_OK (cudaStreamCreate (&stream)) ;

    GrB_Index anz = GB_nnz_held (A) ;

    // one thread per entry, BLOCK_SIZE threads per block
    int32_t gridsz = GB_ICEIL (anz, BLOCK_SIZE) ;

    GrB_Info info = GB_cuda_apply_unop_jit (Cx, ctype, op, flipij, A,
        ythunk_cuda, stream, gridsz, BLOCK_SIZE) ;

    // GrB_NO_VALUE from the JIT means the kernel could not be found/compiled;
    // treat that as a panic here
    if (info == GrB_NO_VALUE) info = GrB_PANIC ;
    // NOTE(review): the stream is not destroyed if GB_OK returns early —
    // acceptable until the FIXME above moves this to the stream pool
    GB_OK (info) ;

    CUDA_OK (cudaStreamSynchronize (stream)) ;
    CUDA_OK (cudaStreamDestroy (stream)) ;

    GB_FREE_WORKSPACE ;
    return GrB_SUCCESS ;
}
19 changes: 19 additions & 0 deletions CUDA/GB_cuda_apply_unop_branch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include "GraphBLAS_cuda.hpp"
#include "GB_cuda.hpp"

//------------------------------------------------------------------------------
// GB_cuda_apply_unop_branch: decide if the CUDA apply-unop kernel can be used
//------------------------------------------------------------------------------

// Returns true only if both types are supported on the GPU and the operator
// exists with a valid JIT hash (op->hash != UINT64_MAX).

bool GB_cuda_apply_unop_branch
(
    const GrB_Type ctype,
    const GrB_Matrix A,
    const GB_Operator op
)
{
    bool types_ok = GB_cuda_type_branch (ctype)
                 && GB_cuda_type_branch (A->type) ;
    bool op_ok = (op != NULL) && (op->hash != UINT64_MAX) ;
    return (types_ok && op_ok) ;
}
Loading

0 comments on commit 9bdf19d

Please sign in to comment.