numba · sklam · Jun 27, 2022 · Dec 6, 2021 · Dec 6, 2021 · Dec 8, 2021
diff --git a/.gitignore b/.gitignore
@@ -102,3 +102,6 @@ ENV/
 
 # mypy
 .mypy_cache/
+
+# Vim swap files
+.*.swp
diff --git a/examples/cuda/curand/example.py b/examples/cuda/curand/example.py
@@ -0,0 +1,93 @@
+# Demonstration of calling cuRAND functions from Numba kernels. Shim functions
+# in a .cu file are used to access the cuRAND functions from Numba. This is
+# based on the cuRAND device API example in:
+# https://docs.nvidia.com/cuda/curand/device-api-overview.html#device-api-example
+#
+# The result produced by this example agrees with the documentation example.
+# E.g. on a particular configuration:
+#
+# Output from this example:                  0.4999931156635
+# Output from cuRAND documentation example:  0.4999931156635
+
+from numba import cuda
+from numba_curand import (curand_init, curand, curand_state_arg_handler,
+                          CurandStates)
+import numpy as np
+
+# Various parameters
+
+threads = 64
+blocks = 64
+nthreads = blocks * threads
+
+sample_count = 10000
+repetitions = 50
+
+
+# State initialization kernel
+
+@cuda.jit(link=['shim.cu'], extensions=[curand_state_arg_handler])
+def setup(states):
+    i = cuda.grid(1)
+    curand_init(1234, i, 0, states, i)
+
+
+# Random sampling kernel - computes the fraction of numbers with low bits set
+# from a random distribution.
+
+@cuda.jit(link=['shim.cu'], extensions=[curand_state_arg_handler])
+def count_low_bits_native(states, sample_count, results):
+    i = cuda.grid(1)
+    count = 0
+
+    # Copy state to local memory
+    # XXX: TBC
+
+    # Generate pseudo-random numbers
+    for sample in range(sample_count):
+        x = curand(states, i)
+
+        # Check if low bit set
+        if(x & 1):
+            count += 1
+
+    # Copy state back to global memory
+    # XXX: TBC
+
+    # Store results
+    results[i] += count
+
+
+# Create state on the device. The CUDA Array Interface provides a convenient
+# way to get the pointer needed for the shim functions.
+
+# Initialise cuRAND state
+
+states = CurandStates(nthreads)
+setup[blocks, threads](states)
+
+# Run random sampling kernel
+
+results = cuda.to_device(np.zeros(nthreads, dtype=np.int32))
+
+for i in range(repetitions):
+    count_low_bits_native[blocks, threads](
+        states, sample_count, results)
+
+
+# Collect the results and summarize them. This could have been done on
+# device, but the corresponding CUDA C++ sample does it on the host, and
+# we're following that example.
+
+host_results = results.copy_to_host()
+
+total = 0
+for i in range(nthreads):
+    total += host_results[i]
+
+# Use float32 to show an exact match between this and the cuRAND
+# documentation example
+fraction = (np.float32(total) /
+            np.float32(nthreads * sample_count * repetitions))
+
+print(f"Fraction with low bit set was {fraction:17.13f}")
diff --git a/examples/cuda/curand/numba_curand.py b/examples/cuda/curand/numba_curand.py
@@ -0,0 +1,118 @@
+# Numba extension for cuRAND functions. Presently only implements:
+#
+# - curand_init()
+# - curand()
+#
+# This is enough for a proof-of-concept embedded calls to cuRAND functions in
+# Numba kernels.
+
+from numba import cuda, types
+from numba.core.extending import models, register_model, typeof_impl
+
+import numpy as np
+
+
+# cuRAND state type as a NumPy dtype - this mirrors the state defined in
+# curand_kernel.h. Can be used to inspect the state through the device array
+# held by CurandStates.
+
+state_fields = [
+    ('d', np.int32),
+    ('v', np.int32, 5),
+    ('boxmuller_flag', np.int32),
+    ('boxmuller_flag_double', np.int32),
+    ('boxmuller_extra', np.float32),
+    ('boxmuller_extra_double', np.float64),
+]
+
+curandState = np.dtype(state_fields, align=True)
+
+
+# Hold an array of cuRAND states - somewhat analagous to a curandState* in
+# C/C++.
+
+class CurandStates:
+    def __init__(self, n):
+        self._array = cuda.device_array(n, dtype=curandState)
+
+    @property
+    def data(self):
+        return self._array.__cuda_array_interface__['data'][0]
+
+
+# Numba typing for cuRAND state.
+
+class CurandState(types.Type):
+    def __init__(self):
+        super().__init__(name='CurandState')
+
+
+curand_state = CurandState()
+
+
+class CurandStatePointer(types.Type):
+    def __init__(self):
+        self.dtype = curand_state
+        super().__init__(name='CurandState*')
+
+
+curand_state_pointer = CurandStatePointer()
+
+
+@typeof_impl.register(CurandStates)
+def typeof_curand_states(val, c):
+    return curand_state_pointer
+
+
+# The CurandState model mirrors the C/C++ structure, and the state pointer
+# represented similarly to other pointers.
+
+@register_model(CurandState)
+class curand_state_model(models.StructModel):
+    def __init__(self, dmm, fe_type):
+        members = [
+            ('d', types.int32),
+            ('v', types.UniTuple(types.int32, 5)),
+            ('boxmuller_flag', types.int32),
+            ('boxmuller_flag_double', types.int32),
+            ('boxmuller_extra', types.float32),
+            ('boxmuller_extra_double', types.float64),
+        ]
+        super().__init__(dmm, fe_type, members)
+
+
+register_model(CurandStatePointer)(models.PointerModel)
+
+
+# Numba forward declarations of cuRAND functions. These call shim functions
+# prepended with _numba, that simply forward arguments to the named cuRAND
+# function.
+
+curand_init_sig = types.void(
+    types.uint64,
+    types.uint64,
+    types.uint64,
+    curand_state_pointer,
+    types.uint64
+)
+
+curand_init = cuda.declare_device('_numba_curand_init', curand_init_sig)
+curand = cuda.declare_device('_numba_curand',
+                             types.uint32(curand_state_pointer, types.uint64))
+
+
+# Argument handling. When a CurandStatePointer is passed into a kernel, we
+# really only need to pass the pointer to the data, not the whole underlying
+# array structure. Our handler here transforms these arguments into a uint64
+# holding the pointer.
+
+class CurandStateArgHandler:
+    def prepare_args(self, ty, val, **kwargs):
+        if isinstance(val, CurandStates):
+            assert ty == curand_state_pointer
+            return types.uint64, val.data
+        else:
+            return ty, val
+
+
+curand_state_arg_handler = CurandStateArgHandler()
diff --git a/examples/cuda/curand/shim.cu b/examples/cuda/curand/shim.cu
@@ -0,0 +1,37 @@
+// Shim functions for calling cuRAND from Numba functions.
+//
+// Numba's ABI expects that:
+//
+// - The return value is used to indicate whether a Python exception occurred
+//   during function execution. This does not happen in C/C++ kernels, so we
+//   always return 0.
+// - The result returned to Numba is passed as a pointer in the first parameter.
+//   For void functions (such as curand_init()), a parameter is passed, but is
+//   unused.
+
+#include <curand_kernel.h>
+
+extern "C"
+__device__ int _numba_curand_init(
+    int* numba_return_value,
+    unsigned long long seed,
+    unsigned long long sequence,
+    unsigned long long offset,
+    curandState *state,
+    unsigned long long index)
+{
+    curand_init(seed, sequence, offset, &state[index]);
+
+    return 0;
+}
+
+extern "C"
+__device__ unsigned int _numba_curand(
+    int* numba_return_value,
+    curandState *states,
+    unsigned long long index)
+{
+  *numba_return_value = curand(&states[index]);
+
+  return 0;
+}