Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hiding symbols from shared object libraries #1723

Closed
wants to merge 10 commits into from
18 changes: 9 additions & 9 deletions cpp/include/raft/cluster/detail/agglomerative.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -155,9 +155,9 @@ void build_dendrogram_host(raft::resources const& handle,
}

template <typename value_idx>
__global__ void write_levels_kernel(const value_idx* children,
value_idx* parents,
value_idx n_vertices)
_RAFT_KERNEL void write_levels_kernel(const value_idx* children,
value_idx* parents,
value_idx n_vertices)
{
value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n_vertices) {
Expand All @@ -179,12 +179,12 @@ __global__ void write_levels_kernel(const value_idx* children,
* @param labels
*/
template <typename value_idx>
__global__ void inherit_labels(const value_idx* children,
const value_idx* levels,
std::size_t n_leaves,
value_idx* labels,
int cut_level,
value_idx n_vertices)
_RAFT_KERNEL void inherit_labels(const value_idx* children,
const value_idx* levels,
std::size_t n_leaves,
value_idx* labels,
int cut_level,
value_idx n_vertices)
{
value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;

Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/cluster/detail/connectivities.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx,
};

template <typename value_idx>
__global__ void fill_indices2(value_idx* indices, size_t m, size_t nnz)
_RAFT_KERNEL void fill_indices2(value_idx* indices, size_t m, size_t nnz)
{
value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x;
if (tid >= nnz) return;
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/cluster/detail/kmeans_balanced.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ template <uint32_t BlockDimY,
typename LabelT,
typename CounterT,
typename MappingOpT>
__global__ void __launch_bounds__((WarpSize * BlockDimY))
_RAFT_KERNEL void __launch_bounds__((WarpSize * BlockDimY))
adjust_centers_kernel(MathT* centers, // [n_clusters, dim]
IdxT n_clusters,
IdxT dim,
Expand Down
46 changes: 23 additions & 23 deletions cpp/include/raft/cluster/detail/kmeans_deprecated.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,12 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
* initialized to zero.
*/
template <typename index_type_t, typename value_type_t>
static __global__ void computeDistances(index_type_t n,
index_type_t d,
index_type_t k,
const value_type_t* __restrict__ obs,
const value_type_t* __restrict__ centroids,
value_type_t* __restrict__ dists)
static _RAFT_KERNEL void computeDistances(index_type_t n,
index_type_t d,
index_type_t k,
const value_type_t* __restrict__ obs,
const value_type_t* __restrict__ centroids,
value_type_t* __restrict__ dists)
{
// Loop index
index_type_t i;
Expand Down Expand Up @@ -173,11 +173,11 @@ static __global__ void computeDistances(index_type_t n,
* cluster. Entries must be initialized to zero.
*/
template <typename index_type_t, typename value_type_t>
static __global__ void minDistances(index_type_t n,
index_type_t k,
value_type_t* __restrict__ dists,
index_type_t* __restrict__ codes,
index_type_t* __restrict__ clusterSizes)
static _RAFT_KERNEL void minDistances(index_type_t n,
index_type_t k,
value_type_t* __restrict__ dists,
index_type_t* __restrict__ codes,
index_type_t* __restrict__ clusterSizes)
{
// Loop index
index_type_t i, j;
Expand Down Expand Up @@ -233,11 +233,11 @@ static __global__ void minDistances(index_type_t n,
* @param code_new Index associated with new centroid.
*/
template <typename index_type_t, typename value_type_t>
static __global__ void minDistances2(index_type_t n,
value_type_t* __restrict__ dists_old,
const value_type_t* __restrict__ dists_new,
index_type_t* __restrict__ codes_old,
index_type_t code_new)
static _RAFT_KERNEL void minDistances2(index_type_t n,
value_type_t* __restrict__ dists_old,
const value_type_t* __restrict__ dists_new,
index_type_t* __restrict__ codes_old,
index_type_t code_new)
{
// Loop index
index_type_t i = threadIdx.x + blockIdx.x * blockDim.x;
Expand Down Expand Up @@ -275,9 +275,9 @@ static __global__ void minDistances2(index_type_t n,
* cluster. Entries must be initialized to zero.
*/
template <typename index_type_t>
static __global__ void computeClusterSizes(index_type_t n,
const index_type_t* __restrict__ codes,
index_type_t* __restrict__ clusterSizes)
static _RAFT_KERNEL void computeClusterSizes(index_type_t n,
const index_type_t* __restrict__ codes,
index_type_t* __restrict__ clusterSizes)
{
index_type_t i = threadIdx.x + blockIdx.x * blockDim.x;
while (i < n) {
Expand Down Expand Up @@ -308,10 +308,10 @@ static __global__ void computeClusterSizes(index_type_t n,
* column is the mean position of a cluster).
*/
template <typename index_type_t, typename value_type_t>
static __global__ void divideCentroids(index_type_t d,
index_type_t k,
const index_type_t* __restrict__ clusterSizes,
value_type_t* __restrict__ centroids)
static _RAFT_KERNEL void divideCentroids(index_type_t d,
index_type_t k,
const index_type_t* __restrict__ clusterSizes,
value_type_t* __restrict__ centroids)
{
// Global indices
index_type_t gidx, gidy;
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/common/detail/scatter.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -22,7 +22,7 @@
namespace raft::detail {

template <typename DataT, int VecLen, typename Lambda, typename IdxT>
__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op)
_RAFT_KERNEL void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op)
{
typedef TxN_t<DataT, VecLen> DataVec;
typedef TxN_t<IdxT, VecLen> IdxVec;
Expand Down
8 changes: 8 additions & 0 deletions cpp/include/raft/core/detail/macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@
#define RAFT_DEVICE_INLINE_FUNCTION _RAFT_DEVICE _RAFT_FORCEINLINE
#endif

#if defined(_RAFT_HAS_CUDA)
// Kernel declaration specifier used throughout RAFT in place of a bare
// `__global__`. Expands to `static __global__` so every kernel defined in a
// header gets internal linkage in each translation unit — keeping the kernel
// symbols out of the shared library's exported symbol table (the purpose of
// this change set, "Hiding symbols from shared object libraries").
#define _RAFT_KERNEL static __global__
#else
// Non-CUDA builds: expands to nothing, so `_RAFT_KERNEL void f(...)` parses as
// a plain function declaration with default (external) linkage.
#define _RAFT_KERNEL
#endif

// NOTE(review): defined empty and not used anywhere in the visible diff —
// presumably a placeholder for a future function-linkage specifier; confirm.
#define _RAFT_FUNC

// The RAFT_INLINE_CONDITIONAL is a conditional inline specifier that removes
// the inline specification when RAFT_COMPILED is defined.
//
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/compress_to_bits.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ namespace raft::distance::detail {
* Note: the division (`/`) is a ceilDiv.
*/
template <typename T = uint64_t, typename = std::enable_if_t<std::is_integral<T>::value>>
__global__ void compress_to_bits_kernel(
_RAFT_KERNEL void compress_to_bits_kernel(
raft::device_matrix_view<const bool, int, raft::layout_c_contiguous> in,
raft::device_matrix_view<T, int, raft::layout_c_contiguous> out)
{
Expand Down
30 changes: 15 additions & 15 deletions cpp/include/raft/distance/detail/fused_l2_nn.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ struct MinReduceOpImpl {
};

template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
_RAFT_KERNEL void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
{
auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x;
if (tid < m) { redOp.init(min + tid, maxVal); }
Expand Down Expand Up @@ -139,20 +139,20 @@ template <typename DataT,
typename KVPReduceOpT,
typename OpT,
typename FinalLambda>
__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
const DataT* x,
const DataT* y,
const DataT* xn,
const DataT* yn,
IdxT m,
IdxT n,
IdxT k,
DataT maxVal,
int* mutex,
ReduceOpT redOp,
KVPReduceOpT pairRedOp,
OpT distance_op,
FinalLambda fin_op)
_RAFT_KERNEL __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
const DataT* x,
const DataT* y,
const DataT* xn,
const DataT* yn,
IdxT m,
IdxT n,
IdxT k,
DataT maxVal,
int* mutex,
ReduceOpT redOp,
KVPReduceOpT pairRedOp,
OpT distance_op,
FinalLambda fin_op)
{
// compile only if below non-ampere arch.
#if __CUDA_ARCH__ < 800
Expand Down
10 changes: 5 additions & 5 deletions cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace raft::distance::kernels::detail {
* @param offset
*/
template <typename math_t, typename exp_t>
__global__ void polynomial_kernel_nopad(
_RAFT_KERNEL void polynomial_kernel_nopad(
math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
{
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
Expand All @@ -56,7 +56,7 @@ __global__ void polynomial_kernel_nopad(
* @param offset
*/
template <typename math_t, typename exp_t>
__global__ void polynomial_kernel(
_RAFT_KERNEL void polynomial_kernel(
math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
{
for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
Expand All @@ -75,7 +75,7 @@ __global__ void polynomial_kernel(
* @param offset
*/
template <typename math_t>
__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
_RAFT_KERNEL void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
{
for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
tid += blockDim.x * gridDim.x) {
Expand All @@ -93,7 +93,7 @@ __global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t
* @param offset
*/
template <typename math_t>
__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
_RAFT_KERNEL void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
{
for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
tidy += blockDim.y * gridDim.y)
Expand Down Expand Up @@ -121,7 +121,7 @@ __global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t ga
* @param gain
*/
template <typename math_t>
__global__ void rbf_kernel_expanded(
_RAFT_KERNEL void rbf_kernel_expanded(
math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain)
{
for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
Expand Down
36 changes: 18 additions & 18 deletions cpp/include/raft/distance/detail/masked_nn.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,24 @@ template <typename DataT,
typename KVPReduceOpT,
typename CoreLambda,
typename FinalLambda>
__global__ __launch_bounds__(P::Nthreads, 2) void masked_l2_nn_kernel(OutT* min,
const DataT* x,
const DataT* y,
const DataT* xn,
const DataT* yn,
const uint64_t* adj,
const IdxT* group_idxs,
IdxT num_groups,
IdxT m,
IdxT n,
IdxT k,
bool sqrt,
DataT maxVal,
int* mutex,
ReduceOpT redOp,
KVPReduceOpT pairRedOp,
CoreLambda core_op,
FinalLambda fin_op)
_RAFT_KERNEL __launch_bounds__(P::Nthreads, 2) void masked_l2_nn_kernel(OutT* min,
const DataT* x,
const DataT* y,
const DataT* xn,
const DataT* yn,
const uint64_t* adj,
const IdxT* group_idxs,
IdxT num_groups,
IdxT m,
IdxT n,
IdxT k,
bool sqrt,
DataT maxVal,
int* mutex,
ReduceOpT redOp,
KVPReduceOpT pairRedOp,
CoreLambda core_op,
FinalLambda fin_op)
{
extern __shared__ char smem[];

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ template <typename Policy,
typename DataT,
typename OutT,
typename FinOpT>
__global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel(
_RAFT_KERNEL __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel(
OpT distance_op, pairwise_matrix_params<IdxT, DataT, OutT, FinOpT> params)
{
// Early exit to minimize the size of the kernel when it is not supposed to be compiled.
Expand Down
16 changes: 8 additions & 8 deletions cpp/include/raft/label/detail/classlabels.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -119,13 +119,13 @@ void getOvrlabels(
// +/-1, return array with the new class labels and corresponding indices.

template <typename Type, int TPB_X, typename Lambda>
__global__ void map_label_kernel(Type* map_ids,
size_t N_labels,
Type* in,
Type* out,
size_t N,
Lambda filter_op,
bool zero_based = false)
_RAFT_KERNEL void map_label_kernel(Type* map_ids,
size_t N_labels,
Type* in,
Type* out,
size_t N,
Lambda filter_op,
bool zero_based = false)
{
int tid = threadIdx.x + blockIdx.x * TPB_X;
if (tid < N) {
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/raft/label/detail/merge_labels.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,7 +32,7 @@ namespace detail {
* For an additional cost we can build the graph with edges
* E={(A[i], B[i]) | M[i]=1} and make this step faster */
template <typename value_idx, int TPB_X = 256>
__global__ void __launch_bounds__(TPB_X)
_RAFT_KERNEL void __launch_bounds__(TPB_X)
propagate_label_kernel(const value_idx* __restrict__ labels_a,
const value_idx* __restrict__ labels_b,
value_idx* __restrict__ R,
Expand Down Expand Up @@ -65,7 +65,7 @@ __global__ void __launch_bounds__(TPB_X)
}

template <typename value_idx, int TPB_X = 256>
__global__ void __launch_bounds__(TPB_X)
_RAFT_KERNEL void __launch_bounds__(TPB_X)
reassign_label_kernel(value_idx* __restrict__ labels_a,
const value_idx* __restrict__ labels_b,
const value_idx* __restrict__ R,
Expand Down
10 changes: 5 additions & 5 deletions cpp/include/raft/linalg/detail/add.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,10 +38,10 @@ void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t st
}

template <class InT, typename IdxType, typename OutT = InT>
__global__ void add_dev_scalar_kernel(OutT* outDev,
const InT* inDev,
const InT* singleScalarDev,
IdxType len)
_RAFT_KERNEL void add_dev_scalar_kernel(OutT* outDev,
const InT* inDev,
const InT* singleScalarDev,
IdxType len)
{
IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; }
Expand Down
Loading