Use CUB's new CDP macros.

NVIDIA · May 17, 2022 · 3e07bf5 · 3e07bf5
1 parent 71fab9e
commit 3e07bf5
Show file tree

Hide file tree

Showing 25 changed files with 1,025 additions and 2,042 deletions.
diff --git a/dependencies/cub b/dependencies/cub
diff --git a/thrust/system/cuda/config.h b/thrust/system/cuda/config.h
@@ -32,29 +32,34 @@
 // older releases. This header will always pull in version info:
 #include <cub/util_namespace.cuh>
 
-#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
-#  if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
-#    define __THRUST_HAS_CUDART__ 1
-#    define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
-#  else
-#    define __THRUST_HAS_CUDART__ 0
-#    define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
-#  endif
-#else
-#  define __THRUST_HAS_CUDART__ 0
-#  define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
-#endif
-
 // These definitions were intended for internal use only and are now obsolete.
 // If you relied on them, consider porting your code to use the functionality
-// in libcu++'s <nv/target> header.
+// in libcu++'s <nv/target> header (for `THRUST_DEVICE_CODE`) or the new CDP
+// macros in CUB's <cub/detail/detect_cuda_runtime.cuh> header (for
+// `__THRUST_HAS_CUDART__` and `THRUST_RUNTIME_FUNCTION`).
+//
 // For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
 // them available again. These should be considered deprecated and will be
 // fully removed in a future version.
 #ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS
-#ifdef __CUDA_ARCH__
-#define THRUST_DEVICE_CODE
-#endif // __CUDA_ARCH__
+
+#  if defined(__CUDACC__) || defined(_NVHPC_CUDA)
+#    if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
+#      define __THRUST_HAS_CUDART__ 1
+#      define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
+#    else
+#      define __THRUST_HAS_CUDART__ 0
+#      define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
+#    endif
+#  else
+#    define __THRUST_HAS_CUDART__ 0
+#    define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
+#  endif
+
+#  ifdef __CUDA_ARCH__
+#    define THRUST_DEVICE_CODE
+#  endif // __CUDA_ARCH__
+
 #endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS
 
 #ifdef THRUST_AGENT_ENTRY_NOINLINE

diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h
@@ -29,12 +29,14 @@
 #include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
 #include <thrust/detail/cstdint.h>
 #include <thrust/detail/minmax.h>
 #include <thrust/detail/temporary_array.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/functional.h>
 #include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/dispatch.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
 #include <thrust/system/cuda/detail/util.h>
@@ -64,7 +66,7 @@ namespace __adjacent_difference {
             class InputIt,
             class OutputIt,
             class BinaryOp>
-  cudaError_t THRUST_RUNTIME_FUNCTION
+  cudaError_t CUB_CDP_FUNCTION
   doit_step(void *d_temp_storage,
             size_t &temp_storage_bytes,
             InputIt first,
@@ -114,7 +116,7 @@ namespace __adjacent_difference {
   template <class InputIt,
             class OutputIt,
             class BinaryOp>
-  cudaError_t THRUST_RUNTIME_FUNCTION
+  cudaError_t CUB_CDP_FUNCTION
   doit_step(void *d_temp_storage,
             size_t &temp_storage_bytes,
             InputIt first,
@@ -139,7 +141,7 @@ namespace __adjacent_difference {
   template <class InputIt,
             class OutputIt,
             class BinaryOp>
-  cudaError_t THRUST_RUNTIME_FUNCTION
+  cudaError_t CUB_CDP_FUNCTION
   doit_step(void *d_temp_storage,
             size_t &temp_storage_bytes,
             InputIt first,
@@ -181,7 +183,7 @@ namespace __adjacent_difference {
             typename InputIt,
             typename OutputIt,
             typename BinaryOp>
-  OutputIt THRUST_RUNTIME_FUNCTION
+  OutputIt CUB_CDP_FUNCTION
   adjacent_difference(execution_policy<Derived>& policy,
                       InputIt                    first,
                       InputIt                    last,
@@ -260,27 +262,18 @@ adjacent_difference(execution_policy<Derived> &policy,
                     OutputIt                   result,
                     BinaryOp                   binary_op)
 {
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __adjacent_difference::adjacent_difference(policy,
-        first,
-        last,
-        result,
-        binary_op);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
-                                      first,
-                                      last,
-                                      result,
-                                      binary_op);
-#endif
-  }
-
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (result = __adjacent_difference::adjacent_difference(policy,
+                                                         first,
+                                                         last,
+                                                         result,
+                                                         binary_op);),
+    (result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
+                                          first,
+                                          last,
+                                          result,
+                                          binary_op);));
+  return result;
 }
 
 template <class Derived,

diff --git a/thrust/system/cuda/detail/cdp_dispatch.h b/thrust/system/cuda/detail/cdp_dispatch.h
@@ -0,0 +1,72 @@
+/*
+*  Copyright 2021-2022 NVIDIA Corporation
+*
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*/
+
+/**
+ * \file
+ * Utilities for CUDA dynamic parallelism.
+ */
+
+#pragma once
+
+#include <cub/config.cuh>
+#include <cub/detail/detect_cuda_runtime.cuh>
+
+#include <nv/target>
+
+/**
+ * \def THRUST_CDP_DISPATCH
+ *
+ * If CUDA Dynamic Parallelism / CUDA Nested Parallelism is available, always
+ * run the parallel implementation. Otherwise, run the parallel implementation
+ * when called from the host, and fallback to the sequential implementation on
+ * the device.
+ *
+ * `par_impl` and `seq_impl` are blocks of C++ statements enclosed in
+ * parentheses, similar to NV_IF_TARGET blocks:
+ *
+ * \code
+ * THRUST_CDP_DISPATCH((launch_parallel_kernel();), (run_serial_impl();));
+ * \endcode
+ */
+
+#ifdef CUB_CDP_ENABLED
+
+// seq_impl unused.
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  NV_IF_TARGET(NV_ANY_TARGET, par_impl)
+
+#else // CUB_CDP_ENABLED
+
+// Special case for NVCC -- need to inform the device path about the kernels
+// that are launched from the host path.
+#if defined(__CUDACC__) && defined(__CUDA_ARCH__)
+
+// Device-side launch not supported, fallback to sequential in device code.
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  if (false)                                                                   \
+  { /* Without this, the device pass won't compile any kernels. */             \
+    NV_IF_TARGET(NV_ANY_TARGET, par_impl);                                     \
+  }                                                                            \
+  NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
+
+#else // NVCC device pass
+
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
+
+#endif // NVCC device pass
+
+#endif // CUB_CDP_ENABLED
diff --git a/thrust/system/cuda/detail/copy.h b/thrust/system/cuda/detail/copy.h
@@ -28,7 +28,10 @@
 
 #include <thrust/detail/config.h>
 
+#include <thrust/advance.h>
+
 #include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/system/cuda/detail/cross_system.h>
 
@@ -117,22 +120,11 @@ copy(execution_policy<System> &system,
      InputIterator             last,
      OutputIterator            result)
 {
-  OutputIterator ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __copy::device_to_device(system, first, last, result);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::copy(cvt_to_seq(derived_cast(system)),
-                       first,
-                       last,
-                       result);
-#endif
-  }
-
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (result = __copy::device_to_device(system, first, last, result);),
+    (result =
+       thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);));
+  return result;
 }    // end copy()
 
 __thrust_exec_check_disable__
@@ -146,19 +138,14 @@ copy_n(execution_policy<System> &system,
        Size                      n,
        OutputIterator            result)
 {
-  OutputIterator ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __copy::device_to_device(system, first, first + n, result);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);
-#endif
-  }
-
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (result = __copy::device_to_device(system,
+                                       first,
+                                       thrust::next(first, n),
+                                       result);),
+    (result =
+       thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);));
+  return result;
 } // end copy_n()
 #endif
+74 −0		cub/detail/detect_cuda_runtime.cuh
+2 −8		cub/detail/device_synchronize.cuh
+2 −2		cub/detail/type_traits.cuh
+5 −5		cub/device/device_adjacent_difference.cuh
+8 −8		cub/device/device_histogram.cuh
+6 −6		cub/device/device_merge_sort.cuh
+9 −9		cub/device/device_partition.cuh
+8 −8		cub/device/device_radix_sort.cuh
+8 −8		cub/device/device_reduce.cuh
+2 −2		cub/device/device_run_length_encode.cuh
+15 −15		cub/device/device_scan.cuh
+8 −8		cub/device/device_segmented_radix_sort.cuh
+7 −7		cub/device/device_segmented_reduce.cuh
+16 −16		cub/device/device_segmented_sort.cuh
+6 −6		cub/device/device_select.cuh
+1 −1		cub/device/device_spmv.cuh
+3 −3		cub/device/dispatch/dispatch_adjacent_difference.cuh
+10 −19		cub/device/dispatch/dispatch_histogram.cuh
+9 −9		cub/device/dispatch/dispatch_merge_sort.cuh
+16 −46		cub/device/dispatch/dispatch_radix_sort.cuh
+9 −36		cub/device/dispatch/dispatch_reduce.cuh
+4 −29		cub/device/dispatch/dispatch_reduce_by_key.cuh
+5 −15		cub/device/dispatch/dispatch_rle.cuh
+4 −16		cub/device/dispatch/dispatch_scan.cuh
+4 −15		cub/device/dispatch/dispatch_scan_by_key.cuh
+52 −47		cub/device/dispatch/dispatch_segmented_sort.cuh
+4 −28		cub/device/dispatch/dispatch_select_if.cuh
+4 −12		cub/device/dispatch/dispatch_spmv_orig.cuh
+4 −4		cub/device/dispatch/dispatch_three_way_partition.cuh
+9 −18		cub/device/dispatch/dispatch_unique_by_key.cuh
+26 −12		cub/util_arch.cuh
+3 −5		cub/util_debug.cuh
+14 −57		cub/util_device.cuh
+51 −13		test/CMakeLists.txt
+21 −9		test/README.md
+0 −39		test/test_allocator.cu
+34 −0		test/test_cdp_variant_state.cu
+4 −75		test/test_device_histogram.cu
+350 −156		test/test_device_radix_sort.cu
+305 −172		test/test_device_reduce.cu
+126 −84		test/test_device_reduce_by_key.cu
+129 −89		test/test_device_run_length_encode.cu
+111 −97		test/test_device_scan.cu
+128 −110		test/test_device_scan_by_key.cu
+12 −8		test/test_device_segmented_sort.cu
+125 −81		test/test_device_select_if.cu
+97 −69		test/test_device_select_unique.cu
+111 −74		test/test_device_select_unique_by_key.cu
+85 −15		test/test_util.h