Skip to content

Commit

Permalink
Merge pull request #294 from DrTimothyAldenDavis/dev2
Browse files Browse the repository at this point in the history
9.2.0
  • Loading branch information
DrTimothyAldenDavis authored May 29, 2024
2 parents 6680f9f + 9f2c7d1 commit 9bdf19d
Show file tree
Hide file tree
Showing 97 changed files with 15,326 additions and 14,046 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ endif ( )

# CUDA is under development for now, and not deployed in production:
set ( GRAPHBLAS_USE_CUDA OFF )
# set ( GRAPHBLAS_USE_CUDA ON )
# set ( GRAPHBLAS_USE_CUDA ON ) # FIXME: use this for CUDA development

include ( SuiteSparsePolicy )

Expand Down
61 changes: 61 additions & 0 deletions CUDA/GB_cuda_apply.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
//------------------------------------------------------------------------------
// GB_cuda_apply.hpp: CPU definitions for CUDA apply operations
//------------------------------------------------------------------------------

// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// FIX: the include guard was GB_CUDA_EWISE_H, copied from the ewise header;
// that collides with the ewise header's guard and could silently drop one of
// the two headers' declarations.  Use a guard unique to this file.
#ifndef GB_CUDA_APPLY_H
#define GB_CUDA_APPLY_H

#include "GB_cuda.hpp"

// apply a unary (or index-unary) operator to A, writing the result into Cx;
// ythunk is the scalar for index-unary ops (may be NULL otherwise — see
// GB_cuda_apply_unop, which only copies it when op->ytype is non-NULL)
GrB_Info GB_cuda_apply_unop_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,       // type of Cx
    const GB_Operator op,       // operator to apply
    const bool flipij,          // if true, swap the i,j indices
    const GrB_Matrix A,         // input matrix
    const GB_void *ythunk,      // scalar input for index-unary ops
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
) ;

// apply a binary operator with its first ('x') input bound to a scalar
// (parameter renamed from xscalar to scalarx to match the definition in
// GB_cuda_apply_bind1st_jit.cpp)
GrB_Info GB_cuda_apply_bind1st_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,       // type of Cx
    const GrB_BinaryOp op,      // binary operator to apply
    const GrB_Matrix A,         // input matrix (second operand)
    const GB_void *scalarx,     // scalar bound to the first operand
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
) ;

// apply a binary operator with its second ('y') input bound to a scalar
// (parameter renamed from xscalar to scalarx to match the definition in
// GB_cuda_apply_bind2nd_jit.cpp)
GrB_Info GB_cuda_apply_bind2nd_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,       // type of Cx
    const GrB_BinaryOp op,      // binary operator to apply
    const GrB_Matrix A,         // input matrix (first operand)
    const GB_void *scalarx,     // scalar bound to the second operand
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
) ;

#endif

53 changes: 53 additions & 0 deletions CUDA/GB_cuda_apply_bind1st_jit.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "GB_cuda_apply.hpp"

extern "C"
{
typedef GB_JIT_CUDA_KERNEL_APPLY_BIND1ST_PROTO ((*GB_jit_dl_function)) ;
}


//------------------------------------------------------------------------------
// GB_cuda_apply_bind1st_jit: JIT kernel dispatch for apply-bind1st on the GPU
//------------------------------------------------------------------------------

// Encodes the problem, loads (or compiles) the matching CUDA JIT kernel, and
// launches it on the given stream.  Per the bind1st naming, the scalar is
// presumably bound to the operator's first input — confirm in the kernel.

GrB_Info GB_cuda_apply_bind1st_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,
    const GrB_BinaryOp op,
    const GrB_Matrix A,
    const GB_void *scalarx,
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
)
{

    //--------------------------------------------------------------------------
    // encodify the problem
    //--------------------------------------------------------------------------

    GB_jit_encoding e ;
    char *op_suffix ;
    uint64_t code_hash = GB_encodify_ewise (&e, &op_suffix,
        GB_JIT_CUDA_KERNEL_APPLYBIND1, false, false, false, GxB_FULL, ctype,
        NULL, false, false, op, false, NULL, A) ;

    //--------------------------------------------------------------------------
    // get the kernel function pointer, loading or compiling it if needed
    //--------------------------------------------------------------------------

    void *dl = NULL ;
    GrB_Info status = GB_jitifyer_load (&dl,
        GB_jit_ewise_family, "cuda_apply_bind1st",
        code_hash, &e, op_suffix, NULL, NULL,
        (GB_Operator) op, ctype, NULL, A->type) ;
    if (status != GrB_SUCCESS)
    {
        return (status) ;
    }

    //--------------------------------------------------------------------------
    // call the jit kernel and return result
    //--------------------------------------------------------------------------

    GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl ;
    return (GB_jit_kernel (Cx, scalarx, A, stream, gridsz, blocksz)) ;
}
53 changes: 53 additions & 0 deletions CUDA/GB_cuda_apply_bind2nd_jit.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "GB_cuda_apply.hpp"

extern "C"
{
typedef GB_JIT_CUDA_KERNEL_APPLY_BIND2ND_PROTO ((*GB_jit_dl_function)) ;
}


//------------------------------------------------------------------------------
// GB_cuda_apply_bind2nd_jit: JIT kernel dispatch for apply-bind2nd on the GPU
//------------------------------------------------------------------------------

// Encodes the problem, loads (or compiles) the matching CUDA JIT kernel, and
// launches it on the given stream.  Per the bind2nd naming, the scalar is
// presumably bound to the operator's second input — confirm in the kernel.

GrB_Info GB_cuda_apply_bind2nd_jit
(
    // output:
    GB_void *Cx,
    // input:
    const GrB_Type ctype,
    const GrB_BinaryOp op,
    const GrB_Matrix A,
    const GB_void *scalarx,
    // CUDA stream and launch parameters:
    cudaStream_t stream,
    int32_t gridsz,
    int32_t blocksz
)
{

    //--------------------------------------------------------------------------
    // encodify the problem
    //--------------------------------------------------------------------------

    GB_jit_encoding e ;
    char *op_suffix ;
    uint64_t code_hash = GB_encodify_ewise (&e, &op_suffix,
        GB_JIT_CUDA_KERNEL_APPLYBIND2, false, false, false, GxB_FULL, ctype,
        NULL, false, false, op, false, A, NULL) ;

    //--------------------------------------------------------------------------
    // get the kernel function pointer, loading or compiling it if needed
    //--------------------------------------------------------------------------

    void *dl = NULL ;
    GrB_Info status = GB_jitifyer_load (&dl,
        GB_jit_ewise_family, "cuda_apply_bind2nd",
        code_hash, &e, op_suffix, NULL, NULL,
        (GB_Operator) op, ctype, A->type, NULL) ;
    if (status != GrB_SUCCESS)
    {
        return (status) ;
    }

    //--------------------------------------------------------------------------
    // call the jit kernel and return result
    //--------------------------------------------------------------------------

    GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl ;
    return (GB_jit_kernel (Cx, A, scalarx, stream, gridsz, blocksz)) ;
}
69 changes: 69 additions & 0 deletions CUDA/GB_cuda_apply_binop.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include "GB_cuda_apply.hpp"

#undef GB_FREE_WORKSPACE
#define GB_FREE_WORKSPACE                               \
{                                                       \
    GB_FREE_WORK (&scalarx_cuda, scalarx_cuda_size) ;   \
}

#undef GB_FREE_ALL
// FIX: GB_FREE_ALL was an empty statement, which leaked scalarx_cuda on the
// early-return error paths taken via GB_OK / CUDA_OK (assuming those macros
// invoke GB_FREE_ALL on failure, per GraphBLAS convention — TODO confirm
// against their definitions).  There is no other output to free here, so
// GB_FREE_ALL is simply the workspace free.
#define GB_FREE_ALL GB_FREE_WORKSPACE

#define BLOCK_SIZE 512
#define LOG2_BLOCK_SIZE 9

//------------------------------------------------------------------------------
// GB_cuda_apply_binop: apply a binary op with one input bound to a scalar
//------------------------------------------------------------------------------

// Copies the scalar into GPU-accessible workspace, creates a stream, and
// dispatches to the bind1st or bind2nd CUDA JIT kernel.

GrB_Info GB_cuda_apply_binop
(
    GB_void *Cx,                // output array
    const GrB_Type ctype,       // type of Cx
    const GrB_BinaryOp op,      // binary operator to apply
    const GrB_Matrix A,         // input matrix
    const GB_void *scalarx,     // scalar to bind (x if bind1st, else y)
    const bool bind1st          // true: op(scalar,A); false: op(A,scalar)
)
{
    ASSERT (scalarx != NULL) ;

    // make a copy of scalarx to ensure it's not on the CPU stack, which would
    // not be accessible to the CUDA kernel
    GB_void *scalarx_cuda = NULL ;
    size_t scalarx_cuda_size = 0 ;

    // exact size of the scalar: the op's x operand for bind1st, else its y
    size_t scalar_size = bind1st ? op->xtype->size : op->ytype->size ;
    scalarx_cuda = GB_MALLOC_WORK (scalar_size, GB_void, &scalarx_cuda_size) ;
    if (scalarx_cuda == NULL)
    {
        return (GrB_OUT_OF_MEMORY) ;
    }

    // FIX: copy only scalar_size bytes.  The original copied
    // scalarx_cuda_size bytes, but that is the *allocated* size reported by
    // GB_MALLOC_WORK, which may be rounded up beyond the scalar's true size —
    // reading past the end of the caller's scalarx buffer.  (Compare
    // GB_cuda_apply_unop, which correctly copies op->ytype->size.)
    memcpy (scalarx_cuda, scalarx, scalar_size) ;

    // FIXME: use the stream pool
    cudaStream_t stream ;
    CUDA_OK (cudaStreamCreate (&stream)) ;

    GrB_Index anz = GB_nnz_held (A) ;

    // one thread per entry, BLOCK_SIZE threads per block
    int32_t gridsz = GB_ICEIL (anz, BLOCK_SIZE) ;

    GrB_Info info ;
    if (bind1st)
    {
        info = GB_cuda_apply_bind1st_jit (Cx, ctype, op, A,
            scalarx_cuda, stream, gridsz, BLOCK_SIZE) ;
    }
    else
    {
        info = GB_cuda_apply_bind2nd_jit (Cx, ctype, op, A,
            scalarx_cuda, stream, gridsz, BLOCK_SIZE) ;
    }

    // GrB_NO_VALUE from the JIT means the kernel could not be found/compiled;
    // treat that as a panic here
    if (info == GrB_NO_VALUE) info = GrB_PANIC ;
    // NOTE(review): the stream is not destroyed if GB_OK returns early —
    // acceptable until the FIXME above moves this to the stream pool
    GB_OK (info) ;

    CUDA_OK (cudaStreamSynchronize (stream)) ;
    CUDA_OK (cudaStreamDestroy (stream)) ;

    GB_FREE_WORKSPACE ;
    return GrB_SUCCESS ;
}
30 changes: 30 additions & 0 deletions CUDA/GB_cuda_apply_binop_branch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include "GraphBLAS_cuda.hpp"
#include "GB_cuda.hpp"

//------------------------------------------------------------------------------
// GB_cuda_apply_binop_branch: decide if the CUDA apply-binop kernel can be used
//------------------------------------------------------------------------------

// Returns true only if all types involved are supported on the GPU and the
// operator has a valid JIT hash.

bool GB_cuda_apply_binop_branch
(
    const GrB_Type ctype,
    const GrB_BinaryOp op,
    const GrB_Matrix A
)
{
    // FIX: check op for NULL *before* dereferencing it.  The original read
    // op->xtype / op->ytype / op->ztype first and only tested op != NULL at
    // the end, so a NULL op would crash before the check was reached.
    // (op->hash == UINT64_MAX appears to flag operators with no JIT kernel —
    // TODO confirm against the jitifyer.)
    if (op == NULL || op->hash == UINT64_MAX)
    {
        return (false) ;
    }

    bool ok = GB_cuda_type_branch (ctype) && GB_cuda_type_branch (A->type) ;

    if (op->xtype != NULL)
    {
        ok = ok && GB_cuda_type_branch (op->xtype) ;
    }
    if (op->ytype != NULL)
    {
        ok = ok && GB_cuda_type_branch (op->ytype) ;
    }
    if (op->ztype != NULL)
    {
        ok = ok && GB_cuda_type_branch (op->ztype) ;
    }

    return (ok) ;
}

60 changes: 60 additions & 0 deletions CUDA/GB_cuda_apply_unop.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include "GB_cuda_apply.hpp"

#undef GB_FREE_WORKSPACE
#define GB_FREE_WORKSPACE                               \
{                                                       \
    GB_FREE_WORK (&ythunk_cuda, ythunk_cuda_size) ;     \
}

#undef GB_FREE_ALL
// FIX: GB_FREE_ALL was an empty statement, which leaked ythunk_cuda on the
// early-return error paths taken via GB_OK / CUDA_OK (assuming those macros
// invoke GB_FREE_ALL on failure, per GraphBLAS convention — TODO confirm
// against their definitions).  There is no other output to free here, so
// GB_FREE_ALL is simply the workspace free.
#define GB_FREE_ALL GB_FREE_WORKSPACE

#define BLOCK_SIZE 512
#define LOG2_BLOCK_SIZE 9

//------------------------------------------------------------------------------
// GB_cuda_apply_unop: apply a unary or index-unary op via the CUDA JIT
//------------------------------------------------------------------------------

// Copies the ythunk scalar (if any) into GPU-accessible workspace, creates a
// stream, and dispatches to the apply-unop CUDA JIT kernel.

GrB_Info GB_cuda_apply_unop
(
    GB_void *Cx,                // output array
    const GrB_Type ctype,       // type of Cx
    const GB_Operator op,       // operator to apply
    const bool flipij,          // if true, swap the i,j indices
    const GrB_Matrix A,         // input matrix
    const GB_void *ythunk       // scalar for index-unary ops (may be NULL)
)
{

    GB_void *ythunk_cuda = NULL ;
    size_t ythunk_cuda_size = 0 ;
    if (ythunk != NULL && op != NULL && op->ytype != NULL)
    {
        // make a copy of ythunk, since ythunk might be allocated on
        // the CPU stack and thus not accessible to the CUDA kernel.
        ythunk_cuda = GB_MALLOC_WORK (op->ytype->size, GB_void,
            &ythunk_cuda_size) ;
        if (ythunk_cuda == NULL)
        {
            return (GrB_OUT_OF_MEMORY) ;
        }
        memcpy (ythunk_cuda, ythunk, op->ytype->size) ;
    }

    // FIXME: use the stream pool
    cudaStream_t stream ;
    CUDA_OK (cudaStreamCreate (&stream)) ;

    GrB_Index anz = GB_nnz_held (A) ;

    // one thread per entry, BLOCK_SIZE threads per block
    int32_t gridsz = GB_ICEIL (anz, BLOCK_SIZE) ;

    GrB_Info info = GB_cuda_apply_unop_jit (Cx, ctype, op, flipij, A,
        ythunk_cuda, stream, gridsz, BLOCK_SIZE) ;

    // GrB_NO_VALUE from the JIT means the kernel could not be found/compiled;
    // treat that as a panic here
    if (info == GrB_NO_VALUE) info = GrB_PANIC ;
    // NOTE(review): the stream is not destroyed if GB_OK returns early —
    // acceptable until the FIXME above moves this to the stream pool
    GB_OK (info) ;

    CUDA_OK (cudaStreamSynchronize (stream)) ;
    CUDA_OK (cudaStreamDestroy (stream)) ;

    GB_FREE_WORKSPACE ;
    return GrB_SUCCESS ;
}
19 changes: 19 additions & 0 deletions CUDA/GB_cuda_apply_unop_branch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include "GraphBLAS_cuda.hpp"
#include "GB_cuda.hpp"

//------------------------------------------------------------------------------
// GB_cuda_apply_unop_branch: decide if the CUDA apply-unop kernel can be used
//------------------------------------------------------------------------------

// Returns true only if both types are supported on the GPU and the operator
// exists with a valid JIT hash (op->hash != UINT64_MAX).

bool GB_cuda_apply_unop_branch
(
    const GrB_Type ctype,
    const GrB_Matrix A,
    const GB_Operator op
)
{
    bool types_ok = GB_cuda_type_branch (ctype)
                 && GB_cuda_type_branch (A->type) ;
    bool op_ok = (op != NULL) && (op->hash != UINT64_MAX) ;
    return (types_ok && op_ok) ;
}
Loading

0 comments on commit 9bdf19d

Please sign in to comment.