From c2d526171b8d601b6139f6ab6f6e5b47e9572184 Mon Sep 17 00:00:00 2001 From: Allison Vacanti Date: Tue, 25 May 2021 13:17:35 -0400 Subject: [PATCH] Use CUB's new CDP macros. --- dependencies/cub | 2 +- thrust/system/cuda/config.h | 13 - .../system/cuda/detail/adjacent_difference.h | 56 +- thrust/system/cuda/detail/copy.h | 46 +- thrust/system/cuda/detail/copy_if.h | 103 +-- .../system/cuda/detail/core/agent_launcher.h | 154 ++-- .../cuda/detail/core/triple_chevron_launch.h | 866 +----------------- thrust/system/cuda/detail/core/util.h | 52 +- thrust/system/cuda/detail/extrema.h | 150 ++- .../detail/internal/copy_device_to_device.h | 2 +- thrust/system/cuda/detail/merge.h | 127 ++- thrust/system/cuda/detail/par.h | 8 +- thrust/system/cuda/detail/par_to_seq.h | 6 - thrust/system/cuda/detail/parallel_for.h | 33 +- thrust/system/cuda/detail/partition.h | 306 +++---- thrust/system/cuda/detail/reduce.h | 53 +- thrust/system/cuda/detail/reduce_by_key.h | 72 +- thrust/system/cuda/detail/scan.h | 69 +- thrust/system/cuda/detail/scan_by_key.h | 116 ++- thrust/system/cuda/detail/set_operations.h | 467 +++++----- thrust/system/cuda/detail/sort.h | 143 +-- thrust/system/cuda/detail/unique.h | 73 +- thrust/system/cuda/detail/unique_by_key.h | 97 +- thrust/system/cuda/detail/util.h | 18 +- 24 files changed, 962 insertions(+), 2070 deletions(-) diff --git a/dependencies/cub b/dependencies/cub index 3efed833c5..64c1c23b57 160000 --- a/dependencies/cub +++ b/dependencies/cub @@ -1 +1 @@ -Subproject commit 3efed833c58605c4e671933e4e4f57db47efac4d +Subproject commit 64c1c23b57db9bc3a2204542d36a1a5ac9277578 diff --git a/thrust/system/cuda/config.h b/thrust/system/cuda/config.h index e13366aa7c..8e29737d46 100644 --- a/thrust/system/cuda/config.h +++ b/thrust/system/cuda/config.h @@ -32,19 +32,6 @@ // older releases. This header will always pull in version info: #include -#if defined(__CUDACC__) || defined(_NVHPC_CUDA) -# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) -# define __THRUST_HAS_CUDART__ 1 -# define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__ -# else -# define __THRUST_HAS_CUDART__ 0 -# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__ -# endif -#else -# define __THRUST_HAS_CUDART__ 0 -# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__ -#endif - #ifdef THRUST_AGENT_ENTRY_NOINLINE #define THRUST_AGENT_ENTRY_INLINE_ATTR __noinline__ #else diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h index fb0ce49f1d..2f63b10f07 100644 --- a/thrust/system/cuda/detail/adjacent_difference.h +++ b/thrust/system/cuda/detail/adjacent_difference.h @@ -29,21 +29,22 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include #include +#include +#include #include -#include -#include -#include +#include +#include +#include #include -#include #include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -362,7 +363,7 @@ namespace __adjacent_difference { class OutputIt, class BinaryOp, class Size> - cudaError_t THRUST_RUNTIME_FUNCTION + cudaError_t CUB_RUNTIME_FUNCTION doit_step(void * d_temp_storage, size_t & temp_storage_bytes, InputIt first, @@ -436,7 +437,7 @@ namespace __adjacent_difference { typename InputIt, typename OutputIt, typename BinaryOp> - OutputIt THRUST_RUNTIME_FUNCTION + OutputIt CUB_RUNTIME_FUNCTION adjacent_difference(execution_policy& policy, InputIt first, InputIt last, @@ -490,27 +491,18 @@ adjacent_difference(execution_policy &policy, OutputIt result, BinaryOp binary_op) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __adjacent_difference::adjacent_difference(policy, - first, - last, - result, - binary_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)), - first, - last, - result, - binary_op); -#endif - } - - return ret; + CUB_CDP_DISPATCH( + (result = __adjacent_difference::adjacent_difference(policy, + first, + last, + result, + binary_op);), + (result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)), + first, + last, + result, + binary_op);)); + return result; } template +#include + #include #include #include +#include + THRUST_NAMESPACE_BEGIN template @@ -117,22 +121,11 @@ copy(execution_policy &system, InputIterator last, OutputIterator result) { - OutputIterator ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __copy::device_to_device(system, first, last, result); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::copy(cvt_to_seq(derived_cast(system)), - first, - last, - result); -#endif - } - - return ret; + CUB_CDP_DISPATCH( + (result = __copy::device_to_device(system, first, last, result);), + (result = + thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);)); + return result; } // end copy() __thrust_exec_check_disable__ @@ -146,19 +139,14 @@ copy_n(execution_policy &system, Size n, OutputIterator result) { - OutputIterator ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __copy::device_to_device(system, first, first + n, result); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result); -#endif - } - - return ret; + CUB_CDP_DISPATCH( + (result = __copy::device_to_device(system, + first, + thrust::next(first, n), + result);), + (result = + thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);)); + return result; } // end copy_n() #endif diff --git a/thrust/system/cuda/detail/copy_if.h b/thrust/system/cuda/detail/copy_if.h index cd20b296ad..b832c6aff2 100644 --- a/thrust/system/cuda/detail/copy_if.h +++ b/thrust/system/cuda/detail/copy_if.h @@ -29,19 +29,20 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include #include +#include #include -#include -#include +#include +#include #include #include #include -#include -#include -#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -598,17 +599,17 @@ namespace __copy_if { class Predicate, class Size, class NumSelectedOutIt> - static cudaError_t THRUST_RUNTIME_FUNCTION - doit_step(void * d_temp_storage, - size_t & temp_storage_bytes, - ItemsIt items, - StencilIt stencil, - OutputIt output_it, - Predicate predicate, - NumSelectedOutIt num_selected_out, - Size num_items, - cudaStream_t stream, - bool debug_sync) + CUB_RUNTIME_FUNCTION + static cudaError_t doit_step(void * d_temp_storage, + size_t & temp_storage_bytes, + ItemsIt items, + StencilIt stencil, + OutputIt output_it, + Predicate predicate, + NumSelectedOutIt num_selected_out, + Size num_items, + cudaStream_t stream, + bool debug_sync) { if (num_items == 0) return cudaSuccess; @@ -695,7 +696,7 @@ namespace __copy_if { typename StencilIt, typename OutputIt, typename Predicate> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION OutputIt copy_if(execution_policy& policy, InputIt first, InputIt last, @@ -789,28 +790,18 @@ copy_if(execution_policy &policy, OutputIterator result, Predicate pred) { - OutputIterator ret = result; - - if (__THRUST_HAS_CUDART__) - { - ret = __copy_if::copy_if(policy, - first, - last, - __copy_if::no_stencil_tag(), - result, - pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::copy_if(cvt_to_seq(derived_cast(policy)), - first, - last, - result, - pred); -#endif - } - return ret; + CUB_CDP_DISPATCH((result = __copy_if::copy_if(policy, + first, + last, + __copy_if::no_stencil_tag(), + result, + pred);), + (result = thrust::copy_if(cvt_to_seq(derived_cast(policy)), + first, + last, + result, + pred);)); + return result; } // func copy_if __thrust_exec_check_disable__ @@ -827,29 +818,15 @@ copy_if(execution_policy &policy, OutputIterator result, Predicate pred) { - OutputIterator ret = result; - - if (__THRUST_HAS_CUDART__) - { - ret = __copy_if::copy_if(policy, - first, - last, - stencil, - result, - pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::copy_if(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - result, - pred); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (result = __copy_if::copy_if(policy, first, last, stencil, result, pred);), + (result = thrust::copy_if(cvt_to_seq(derived_cast(policy)), + first, + last, + stencil, + result, + pred);)); + return result; } // func copy_if } // namespace cuda_cub diff --git a/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/system/cuda/detail/core/agent_launcher.h index 4cdd7ff469..4157513d90 100644 --- a/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/system/cuda/detail/core/agent_launcher.h @@ -393,7 +393,7 @@ namespace core { MAX_SHMEM_PER_BLOCK> shm1; template - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentLauncher(AgentPlan plan_, Size count_, cudaStream_t stream_, @@ -413,7 +413,7 @@ namespace core { } template - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentLauncher(AgentPlan plan_, Size count_, cudaStream_t stream_, @@ -433,7 +433,7 @@ namespace core { assert(count > 0); } - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentLauncher(AgentPlan plan_, cudaStream_t stream_, char const* name_, @@ -451,7 +451,7 @@ namespace core { assert(plan.grid_size > 0); } - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentLauncher(AgentPlan plan_, cudaStream_t stream_, char* vshmem, @@ -471,7 +471,7 @@ namespace core { } #if 0 - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0) { // in separable compilation mode, we have no choice @@ -489,14 +489,14 @@ namespace core { return get_agent_plan(ptx_version); #endif } - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentPlan static get_plan_default() { return get_agent_plan(sm_arch<0>::type::ver); } #endif - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION typename core::get_plan::type static get_plan(cudaStream_t , void* d_ptr = 0) { THRUST_UNUSED_VAR(d_ptr); @@ -504,13 +504,13 @@ namespace core { return get_agent_plan(ptx_version); } - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION typename core::get_plan::type static get_plan() { return get_agent_plan(lowest_supported_sm_arch::ver); } - THRUST_RUNTIME_FUNCTION void sync() const + CUB_RUNTIME_FUNCTION void sync() const { if (debug_sync) { @@ -521,7 +521,7 @@ namespace core { } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION max_blocks_per_sm_impl(K k, int block_threads) { int occ; @@ -530,16 +530,14 @@ namespace core { } template - cuda_optional THRUST_RUNTIME_FUNCTION + cuda_optional CUB_RUNTIME_FUNCTION max_sm_occupancy(K k) const { return max_blocks_per_sm_impl(k, plan.block_threads); } - - template - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION void print_info(K k) const { if (debug_sync) @@ -582,112 +580,112 @@ namespace core { #if 0 template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { return max_blocks_per_sm_impl(_kernel_agent, plan.block_threads); } #else template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0, _1) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5,_6) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD) = _kernel_agent; return max_blocks_per_sm_impl(ptr, plan.block_threads); } template - static cuda_optional THRUST_RUNTIME_FUNCTION + static cuda_optional CUB_RUNTIME_FUNCTION get_max_blocks_per_sm(AgentPlan plan) { void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD,_xE) = _kernel_agent; @@ -703,7 +701,7 @@ namespace core { // don't compile other kernel which accepts pointer // and save on compilations template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, Args... args) const { assert(has_shmem && vshmem == NULL); @@ -721,7 +719,7 @@ namespace core { // do actually have enough shared memory, the compilation time will double. // template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, Args... args) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -731,7 +729,7 @@ namespace core { } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(Args... args) const { launch_impl(has_enough_shmem_t(),args...); @@ -739,7 +737,7 @@ namespace core { } #else template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -749,7 +747,7 @@ namespace core { .doit(ptr, vshmem, x0); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -759,7 +757,7 @@ namespace core { .doit(ptr, vshmem, x0, x1); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -769,7 +767,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -779,7 +777,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -789,7 +787,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -799,7 +797,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -809,7 +807,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -819,7 +817,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -828,7 +826,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -838,7 +836,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -848,7 +846,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -858,7 +856,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -868,7 +866,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC,_xD xD) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -878,7 +876,7 @@ namespace core { .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC,_xD xD,_xE xE) const { assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0)); @@ -893,7 +891,7 @@ namespace core { //////////////////////////////////////////////////////// template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0) const { assert(has_shmem && vshmem == NULL); @@ -903,7 +901,7 @@ namespace core { .doit(ptr, x0); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1) const { assert(has_shmem && vshmem == NULL); @@ -913,7 +911,7 @@ namespace core { .doit(ptr, x0, x1); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2) const { assert(has_shmem && vshmem == NULL); @@ -923,7 +921,7 @@ namespace core { .doit(ptr, x0, x1, x2); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3) const { assert(has_shmem && vshmem == NULL); @@ -933,7 +931,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const { assert(has_shmem && vshmem == NULL); @@ -943,7 +941,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const { assert(has_shmem && vshmem == NULL); @@ -953,7 +951,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4, x5); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const { assert(has_shmem && vshmem == NULL); @@ -963,7 +961,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4, x5, x6); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const { assert(has_shmem && vshmem == NULL); @@ -973,7 +971,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const { assert(has_shmem && vshmem == NULL); @@ -983,7 +981,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const { assert(has_shmem && vshmem == NULL); @@ -993,7 +991,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const { assert(has_shmem && vshmem == NULL); @@ -1003,7 +1001,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const { assert(has_shmem && vshmem == NULL); @@ -1013,7 +1011,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const { assert(has_shmem && vshmem == NULL); @@ -1023,7 +1021,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const { assert(has_shmem && vshmem == NULL); @@ -1033,7 +1031,7 @@ namespace core { .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const { assert(has_shmem && vshmem == NULL); @@ -1048,105 +1046,105 @@ namespace core { //////////////////////////////////////////////////////// template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0) const { launch_impl(has_enough_shmem_t(), x0); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1) const { launch_impl(has_enough_shmem_t(), x0, x1); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2) const { launch_impl(has_enough_shmem_t(), x0, x1, x2); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD); sync(); } template - void THRUST_RUNTIME_FUNCTION + void CUB_RUNTIME_FUNCTION launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const { launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE); diff --git a/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/system/cuda/detail/core/triple_chevron_launch.h index bf9955c6da..c439f17a05 100644 --- a/thrust/system/cuda/detail/core/triple_chevron_launch.h +++ b/thrust/system/cuda/detail/core/triple_chevron_launch.h @@ -45,7 +45,7 @@ namespace launcher { Size const shared_mem; cudaStream_t const stream; - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION triple_chevron(dim3 grid_, dim3 block_, Size shared_mem_ = 0, @@ -55,7 +55,6 @@ namespace launcher { shared_mem(shared_mem_), stream(stream_) {} -#if 0 template cudaError_t __host__ doit_host(K k, Args const&... args) const @@ -63,120 +62,6 @@ namespace launcher { k<<>>(args...); return cudaPeekAtLastError(); } -#else - template - cudaError_t __host__ - doit_host(K k, _0 x0) const - { - k<<>>(x0); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1) const - { - k<<>>(x0,x1); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2) const - { - k<<>>(x0,x1,x2); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3) const - { - k<<>>(x0,x1,x2,x3); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - k<<>>(x0,x1,x2,x3,x4); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - k<<>>(x0,x1,x2,x3,x4,x5); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE); - return cudaPeekAtLastError(); - } - template - cudaError_t __host__ - doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE, _xF xF) const - { - k<<>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF); - return cudaPeekAtLastError(); - } -#endif template size_t __device__ @@ -186,7 +71,6 @@ namespace launcher { return alignment * ((offset + (alignment - 1))/ alignment); } -#if 0 size_t __device__ argument_pack_size(size_t size) const { return size; } template size_t __device__ @@ -195,110 +79,6 @@ namespace launcher { size = align_up(size); return argument_pack_size(size + sizeof(Arg), args...); } -#else - template - size_t __device__ - argument_pack_size(size_t size, Arg) const - { - return align_up(size) + sizeof(Arg); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE); - } - template - size_t __device__ - argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const - { - return argument_pack_size(align_up(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF); - } -#endif /* variadic */ template size_t __device__ copy_arg(char* buffer, size_t offset, Arg arg) const @@ -309,664 +89,52 @@ namespace launcher { return offset + sizeof(Arg); } -#if 0 - void __device__ fill_arguments(char*, size_t) const {} + __device__ + void fill_arguments(char*, size_t) const + {} + template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg const& arg, Args const& ... args) const + __device__ + void fill_arguments(char* buffer, + size_t offset, + Arg const& arg, + Args const& ... args) const { fill_arguments(buffer, copy_arg(buffer, offset, arg), args...); } -#else - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg) const - { - copy_arg(buffer, offset, arg); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE); - } - template - void __device__ - fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const - { - fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF); - } -#endif /* variadic */ -#if 0 template cudaError_t __device__ doit_device(K k, Args const&... args) const { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ const size_t size = argument_pack_size(0,args...); void *param_buffer = cudaGetParameterBuffer(64,size); fill_arguments((char*)param_buffer, 0, args...); - status = launch_device(k, param_buffer); -#endif - return status; - } -#else - template - cudaError_t __device__ - doit_device(K k, _0 x0) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); -#endif - return status; + return launch_device(k, param_buffer); } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); - THRUST_UNUSED_VAR(xC); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); - THRUST_UNUSED_VAR(xC); - THRUST_UNUSED_VAR(xD); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); - THRUST_UNUSED_VAR(xC); - THRUST_UNUSED_VAR(xD); - THRUST_UNUSED_VAR(xE); -#endif - return status; - } - template - cudaError_t __device__ - doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const - { - cudaError_t status = cudaErrorNotSupported; -#if __THRUST_HAS_CUDART__ - const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF); - void *param_buffer = cudaGetParameterBuffer(64,size); - fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF); - status = launch_device(k, param_buffer); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(x0); - THRUST_UNUSED_VAR(x1); - THRUST_UNUSED_VAR(x2); - THRUST_UNUSED_VAR(x3); - THRUST_UNUSED_VAR(x4); - THRUST_UNUSED_VAR(x5); - THRUST_UNUSED_VAR(x6); - THRUST_UNUSED_VAR(x7); - THRUST_UNUSED_VAR(x8); - THRUST_UNUSED_VAR(x9); - THRUST_UNUSED_VAR(xA); - THRUST_UNUSED_VAR(xB); - THRUST_UNUSED_VAR(xC); - THRUST_UNUSED_VAR(xD); - THRUST_UNUSED_VAR(xE); - THRUST_UNUSED_VAR(xF); -#endif - return status; - } -#endif /* variadic */ template cudaError_t __device__ launch_device(K k, void* buffer) const { -#if __THRUST_HAS_CUDART__ return cudaLaunchDevice((void*)k, buffer, dim3(grid), dim3(block), shared_mem, stream); -#else - THRUST_UNUSED_VAR(k); - THRUST_UNUSED_VAR(buffer); - return cudaErrorNotSupported; -#endif } - -#if defined(_NVHPC_CUDA) -# define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(...) \ - (__builtin_is_device_code() ? \ - doit_device(__VA_ARGS__) : doit_host(__VA_ARGS__)) -#elif defined(__CUDA_ARCH__) -# define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_device -#else -# define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_host -#endif - -#if 0 __thrust_exec_check_disable__ template - cudaError_t THRUST_FUNCTION - doit(K k, Args const&... args) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, args...); - } -#else - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const + THRUST_FUNCTION + cudaError_t doit(K k, Args const&... args) const { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA); + NV_IF_TARGET(NV_IS_HOST, + (return doit_host(k, args...);), + (return doit_device(k, args...);)); } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE); - } - __thrust_exec_check_disable__ - template - cudaError_t THRUST_FUNCTION - doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE, _xF xF) const - { - return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF); - } -#endif -#undef THRUST_TRIPLE_LAUNCHER_HOSTDEVICE + }; // struct triple_chevron } // namespace launcher diff --git a/thrust/system/cuda/detail/core/util.h b/thrust/system/cuda/detail/core/util.h index 28ae2c3a69..73c85008a5 100644 --- a/thrust/system/cuda/detail/core/util.h +++ b/thrust/system/cuda/detail/core/util.h @@ -263,10 +263,10 @@ namespace core { int shared_memory_size; int grid_size; - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentPlan() {} - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentPlan(int block_threads_, int items_per_thread_, int shared_memory_size_, @@ -279,7 +279,7 @@ namespace core { { } - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentPlan(AgentPlan const& plan) : block_threads(plan.block_threads), items_per_thread(plan.items_per_thread), @@ -288,7 +288,7 @@ namespace core { grid_size(plan.grid_size) {} template - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AgentPlan(PtxPlan, typename thrust::detail::disable_if_convertible< PtxPlan, @@ -329,7 +329,7 @@ namespace core { struct get_agent_plan_impl > { typedef typename get_plan::type Plan; - Plan THRUST_RUNTIME_FUNCTION + Plan CUB_RUNTIME_FUNCTION static get(int ptx_version) { if (ptx_version >= SM::ver) @@ -345,7 +345,7 @@ namespace core { struct get_agent_plan_impl > { typedef typename get_plan::type Plan; - Plan THRUST_RUNTIME_FUNCTION + Plan CUB_RUNTIME_FUNCTION static get(int /* ptx_version */) { typedef typename get_plan::type Plan; @@ -354,14 +354,9 @@ namespace core { }; template - typename get_plan::type THRUST_RUNTIME_FUNCTION - get_agent_plan(int ptx_version) + CUB_RUNTIME_FUNCTION + typename get_plan::type get_agent_plan(int ptx_version) { - // Use one path, with Agent::ptx_plan, for device code where device-side - // kernel launches are supported. The other path, with - // get_agent_plan_impl::get(version), is for host code and for device - // code without device-side kernel launches. -#ifdef __THRUST_HAS_CUDART__ NV_IF_TARGET( NV_IS_DEVICE, ( @@ -371,9 +366,6 @@ namespace core { return plan_type{ptx_plan{}}; ), // NV_IS_HOST: ( return get_agent_plan_impl::get(ptx_version); )); -#else - return get_agent_plan_impl::get(ptx_version); -#endif } // XXX keep this dead-code for now as a gentle reminder @@ -438,7 +430,7 @@ namespace core { } template - AgentPlan THRUST_RUNTIME_FUNCTION + AgentPlan CUB_RUNTIME_FUNCTION get_agent_plan(cudaStream_t s = 0, void *ptr = 0) { return xget_agent_plan_impl(get_agent_plan_kernel, @@ -457,8 +449,8 @@ namespace core { ///////////////////////// ///////////////////////// - THRUST_RUNTIME_FUNCTION - int get_sm_count() + CUB_RUNTIME_FUNCTION + inline int get_sm_count() { int dev_id; cuda_cub::throw_on_error(cudaGetDevice(&dev_id), @@ -476,8 +468,8 @@ namespace core { return i32value; } - size_t THRUST_RUNTIME_FUNCTION - get_max_shared_memory_per_block() + CUB_RUNTIME_FUNCTION + inline size_t get_max_shared_memory_per_block() { int dev_id; cuda_cub::throw_on_error(cudaGetDevice(&dev_id), @@ -496,8 +488,8 @@ namespace core { return static_cast(i32value); } - size_t THRUST_RUNTIME_FUNCTION - virtual_shmem_size(size_t shmem_per_block) + CUB_RUNTIME_FUNCTION + inline size_t virtual_shmem_size(size_t shmem_per_block) { size_t max_shmem_per_block = core::get_max_shared_memory_per_block(); if (shmem_per_block > max_shmem_per_block) @@ -506,8 +498,8 @@ namespace core { return 0; } - size_t THRUST_RUNTIME_FUNCTION - vshmem_size(size_t shmem_per_block, size_t num_blocks) + CUB_RUNTIME_FUNCTION + inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks) { size_t max_shmem_per_block = core::get_max_shared_memory_per_block(); if (shmem_per_block > max_shmem_per_block) @@ -625,16 +617,16 @@ namespace core { __host__ __device__ operator T const &() const { return value_; } }; - cuda_optional THRUST_RUNTIME_FUNCTION - get_ptx_version() + CUB_RUNTIME_FUNCTION + inline cuda_optional get_ptx_version() { int ptx_version = 0; cudaError_t status = cub::PtxVersion(ptx_version); return cuda_optional(ptx_version, status); } - cudaError_t THRUST_RUNTIME_FUNCTION - sync_stream(cudaStream_t stream) + CUB_RUNTIME_FUNCTION + inline cudaError_t sync_stream(cudaStream_t stream) { return cub::SyncStream(stream); } @@ -743,7 +735,7 @@ namespace core { } template - THRUST_RUNTIME_FUNCTION cudaError_t + CUB_RUNTIME_FUNCTION cudaError_t alias_storage(void* storage_ptr, size_t& storage_size, void* (&allocations)[ALLOCATIONS], diff --git a/thrust/system/cuda/detail/extrema.h b/thrust/system/cuda/detail/extrema.h index 0519b7df31..7a502ccf70 100644 --- a/thrust/system/cuda/detail/extrema.h +++ b/thrust/system/cuda/detail/extrema.h @@ -29,15 +29,16 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include #include #include +#include #include #include -#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -152,7 +153,7 @@ namespace __extrema { class OutputIt, class Size, class ReductionOp> - cudaError_t THRUST_RUNTIME_FUNCTION + cudaError_t CUB_RUNTIME_FUNCTION doit_step(void * d_temp_storage, size_t & temp_storage_bytes, InputIt input_it, @@ -314,7 +315,7 @@ namespace __extrema { typename Size, typename BinaryOp, typename T> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION T extrema(execution_policy& policy, InputIt first, Size num_items, @@ -373,7 +374,7 @@ namespace __extrema { class Derived, class ItemsIt, class BinaryPred> - ItemsIt THRUST_RUNTIME_FUNCTION + ItemsIt CUB_RUNTIME_FUNCTION element(execution_policy &policy, ItemsIt first, ItemsIt last, @@ -421,24 +422,16 @@ min_element(execution_policy &policy, ItemsIt last, BinaryPred binary_pred) { - ItemsIt ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = __extrema::element<__extrema::arg_min_f>(policy, - first, - last, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::min_element(cvt_to_seq(derived_cast(policy)), - first, - last, - binary_pred); -#endif - } - return ret; + CUB_CDP_DISPATCH((last = + __extrema::element<__extrema::arg_min_f>(policy, + first, + last, + binary_pred);), + (last = thrust::min_element(cvt_to_seq(derived_cast(policy)), + first, + last, + binary_pred);)); + return last; } template &policy, ItemsIt last, BinaryPred binary_pred) { - ItemsIt ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = __extrema::element<__extrema::arg_max_f>(policy, - first, - last, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::max_element(cvt_to_seq(derived_cast(policy)), - first, - last, - binary_pred); -#endif - } - return ret; + CUB_CDP_DISPATCH((last = + __extrema::element<__extrema::arg_max_f>(policy, + first, + last, + binary_pred);), + (last = thrust::max_element(cvt_to_seq(derived_cast(policy)), + first, + last, + binary_pred);)); + return last; } template &policy, ItemsIt last, BinaryPred binary_pred) { - pair ret = thrust::make_pair(first, first); - - if (__THRUST_HAS_CUDART__) + auto ret = thrust::make_pair(last, last); + if (first == last) { - if (first == last) - return thrust::make_pair(last, last); - - typedef typename iterator_traits::value_type InputType; - typedef typename iterator_traits::difference_type IndexType; - - IndexType num_items = static_cast(thrust::distance(first, last)); - - - typedef tuple > iterator_tuple; - typedef zip_iterator zip_iterator; - - iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t(0)); - - - typedef __extrema::arg_minmax_f arg_minmax_t; - typedef typename arg_minmax_t::two_pairs_type two_pairs_type; - typedef typename arg_minmax_t::duplicate_tuple duplicate_t; - typedef transform_input_iterator_t - transform_t; - - zip_iterator begin = make_zip_iterator(iter_tuple); - two_pairs_type result = __extrema::extrema(policy, - transform_t(begin, duplicate_t()), - num_items, - arg_minmax_t(binary_pred), - (two_pairs_type *)(NULL)); - ret = thrust::make_pair(first + get<1>(get<0>(result)), - first + get<1>(get<1>(result))); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::minmax_element(cvt_to_seq(derived_cast(policy)), - first, - last, - binary_pred); -#endif + return ret; } + + CUB_CDP_DISPATCH( + (using InputType = typename iterator_traits::value_type; + using IndexType = typename iterator_traits::difference_type; + + const auto num_items = + static_cast(thrust::distance(first, last)); + + using iterator_tuple = tuple>; + using zip_iterator = zip_iterator; + + iterator_tuple iter_tuple = + thrust::make_tuple(first, counting_iterator_t(0)); + + using arg_minmax_t = + __extrema::arg_minmax_f; + using two_pairs_type = typename arg_minmax_t::two_pairs_type; + using duplicate_t = typename arg_minmax_t::duplicate_tuple; + using transform_t = + transform_input_iterator_t; + + zip_iterator begin = make_zip_iterator(iter_tuple); + two_pairs_type result = + __extrema::extrema(policy, + transform_t(begin, duplicate_t()), + num_items, + arg_minmax_t(binary_pred), + (two_pairs_type *)(NULL)); + ret = thrust::make_pair(first + get<1>(get<0>(result)), + first + get<1>(get<1>(result)));), + // CDP Sequential impl: + (ret = thrust::minmax_element(cvt_to_seq(derived_cast(policy)), + first, + last, + binary_pred);)); return ret; } diff --git a/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/system/cuda/detail/internal/copy_device_to_device.h index 69c4e20dfd..27a2046df6 100644 --- a/thrust/system/cuda/detail/internal/copy_device_to_device.h +++ b/thrust/system/cuda/detail/internal/copy_device_to_device.h @@ -43,7 +43,7 @@ namespace __copy { template - OutputIt THRUST_RUNTIME_FUNCTION + OutputIt CUB_RUNTIME_FUNCTION device_to_device(execution_policy& policy, InputIt first, InputIt last, diff --git a/thrust/system/cuda/detail/merge.h b/thrust/system/cuda/detail/merge.h index b8b17012b2..e5cacbfed2 100644 --- a/thrust/system/cuda/detail/merge.h +++ b/thrust/system/cuda/detail/merge.h @@ -29,21 +29,21 @@ j * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + #include #include -#include - -#include -#include +#include +#include +#include +#include +#include #include #include +#include +#include #include -#include -#include -#include -#include -#include +#include THRUST_NAMESPACE_BEGIN namespace cuda_cub { @@ -673,7 +673,7 @@ namespace __merge { class KeysOutputIt, class ItemsOutputIt, class CompareOp> - cudaError_t THRUST_RUNTIME_FUNCTION + cudaError_t CUB_RUNTIME_FUNCTION doit_step(void* d_temp_storage, size_t& temp_storage_bytes, KeysIt1 keys1, @@ -782,7 +782,7 @@ namespace __merge { typename KeysOutputIt, typename ItemsOutputIt, typename CompareOp> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION pair merge(execution_policy& policy, KeysIt1 keys1_first, @@ -876,38 +876,28 @@ merge(execution_policy& policy, CompareOp compare_op) { - ResultIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typedef typename thrust::iterator_value::type keys_type; - // - keys_type* null_ = NULL; - // - ret = __merge::merge(policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - null_, - null_, - result, - null_, - compare_op) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::merge(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - result, - compare_op); -#endif - } - return ret; + CUB_CDP_DISPATCH((using keys_type = thrust::iterator_value_t; + keys_type *null_ = nullptr; + auto tmp = + __merge::merge(policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + null_, + null_, + result, + null_, + compare_op); + result = tmp.first;), + (result = thrust::merge(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + result, + compare_op);)); + return result; } template @@ -950,35 +940,28 @@ merge_by_key(execution_policy &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - return __merge::merge(policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::merge_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op); -#endif - } + auto ret = thrust::make_pair(keys_result, items_result); + CUB_CDP_DISPATCH((ret = + __merge::merge(policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op);), + (ret = thrust::merge_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op);)); return ret; } diff --git a/thrust/system/cuda/detail/par.h b/thrust/system/cuda/detail/par.h index 42c701ca70..1948fa253c 100644 --- a/thrust/system/cuda/detail/par.h +++ b/thrust/system/cuda/detail/par.h @@ -53,7 +53,7 @@ struct execute_on_stream_base : execution_policy execute_on_stream_base(cudaStream_t stream_ = default_stream()) : stream(stream_){} - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION Derived on(cudaStream_t const &s) const { @@ -82,7 +82,7 @@ struct execute_on_stream_nosync_base : execution_policy execute_on_stream_nosync_base(cudaStream_t stream_ = default_stream()) : stream(stream_){} - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION Derived on(cudaStream_t const &s) const { @@ -145,7 +145,7 @@ struct par_t : execution_policy, typedef execute_on_stream stream_attachment_type; - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION stream_attachment_type on(cudaStream_t const &stream) const { @@ -168,7 +168,7 @@ struct par_nosync_t : execution_policy, typedef execute_on_stream_nosync stream_attachment_type; - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION stream_attachment_type on(cudaStream_t const &stream) const { diff --git a/thrust/system/cuda/detail/par_to_seq.h b/thrust/system/cuda/detail/par_to_seq.h index 833634982b..e710f017b6 100644 --- a/thrust/system/cuda/detail/par_to_seq.h +++ b/thrust/system/cuda/detail/par_to_seq.h @@ -82,11 +82,5 @@ cvt_to_seq(Policy& policy) return cvt_to_seq_impl::doit(policy); } -#if __THRUST_HAS_CUDART__ -#define THRUST_CUDART_DISPATCH par -#else -#define THRUST_CUDART_DISPATCH seq -#endif - } // namespace cuda_ THRUST_NAMESPACE_END diff --git a/thrust/system/cuda/detail/parallel_for.h b/thrust/system/cuda/detail/parallel_for.h index be4ff14a59..7bcfe7939e 100644 --- a/thrust/system/cuda/detail/parallel_for.h +++ b/thrust/system/cuda/detail/parallel_for.h @@ -29,13 +29,14 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include #include -#include +#include #include #include +#include + +#include THRUST_NAMESPACE_BEGIN @@ -122,7 +123,7 @@ namespace __parallel_for { template - THRUST_RUNTIME_FUNCTION cudaError_t + CUB_RUNTIME_FUNCTION cudaError_t parallel_for(Size num_items, F f, cudaStream_t stream) @@ -155,21 +156,19 @@ parallel_for(execution_policy &policy, Size count) { if (count == 0) - return; - - if (__THRUST_HAS_CUDART__) - { - cudaStream_t stream = cuda_cub::stream(policy); - cudaError_t status = __parallel_for::parallel_for(count, f, stream); - cuda_cub::throw_on_error(status, "parallel_for failed"); - } - else { -#if !__THRUST_HAS_CUDART__ - for (Size idx = 0; idx != count; ++idx) - f(idx); -#endif + return; } + + CUB_CDP_DISPATCH((cudaStream_t stream = cuda_cub::stream(policy); + cudaError_t status = + __parallel_for::parallel_for(count, f, stream); + cuda_cub::throw_on_error(status, "parallel_for failed");), + // CDP sequential impl: + (for (Size idx = 0; idx != count; ++idx) + { + f(idx); + })); } } // namespace cuda_cub diff --git a/thrust/system/cuda/detail/partition.h b/thrust/system/cuda/detail/partition.h index 85d9bb8136..3bd4b56ed9 100644 --- a/thrust/system/cuda/detail/partition.h +++ b/thrust/system/cuda/detail/partition.h @@ -29,21 +29,25 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include #include #include -#include -#include +#include +#include +#include +#include +#include #include +#include #include -#include -#include #include -#include -#include -#include +#include +#include // cub::ScanTileState +#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -608,7 +612,7 @@ namespace __partition { class Predicate, class Size, class NumSelectedOutIt> - static cudaError_t THRUST_RUNTIME_FUNCTION + static cudaError_t CUB_RUNTIME_FUNCTION doit_step(void * d_temp_storage, size_t & temp_storage_bytes, ItemsIt items, @@ -706,7 +710,7 @@ namespace __partition { typename SelectedOutIt, typename RejectedOutIt, typename Predicate> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION pair partition(execution_policy& policy, InputIt first, @@ -792,7 +796,7 @@ namespace __partition { typename Iterator, typename StencilIt, typename Predicate> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION Iterator partition_inplace(execution_policy& policy, Iterator first, Iterator last, @@ -846,29 +850,22 @@ partition_copy(execution_policy &policy, RejectedOutIt rejected_result, Predicate predicate) { - pair ret = thrust::make_pair(selected_result, rejected_result); - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition(policy, - first, - last, - stencil, - selected_result, - rejected_result, - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - selected_result, - rejected_result, - predicate); -#endif - } + auto ret = thrust::make_pair(selected_result, rejected_result); + CUB_CDP_DISPATCH((ret = __partition::partition(policy, + first, + last, + stencil, + selected_result, + rejected_result, + predicate);), + (ret = + thrust::partition_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + stencil, + selected_result, + rejected_result, + predicate);)); return ret; } @@ -886,28 +883,21 @@ partition_copy(execution_policy &policy, RejectedOutIt rejected_result, Predicate predicate) { - pair ret = thrust::make_pair(selected_result, rejected_result); - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition(policy, - first, - last, - __partition::no_stencil_tag(), - selected_result, - rejected_result, - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - selected_result, - rejected_result, - predicate); -#endif - } + auto ret = thrust::make_pair(selected_result, rejected_result); + CUB_CDP_DISPATCH((ret = __partition::partition(policy, + first, + last, + __partition::no_stencil_tag(), + selected_result, + rejected_result, + predicate);), + (ret = + thrust::partition_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + selected_result, + rejected_result, + predicate);)); return ret; } @@ -925,28 +915,21 @@ stable_partition_copy(execution_policy &policy, RejectedOutIt rejected_result, Predicate predicate) { - pair ret = thrust::make_pair(selected_result, rejected_result); - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition(policy, - first, - last, - __partition::no_stencil_tag(), - selected_result, - rejected_result, - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - selected_result, - rejected_result, - predicate); -#endif - } + auto ret = thrust::make_pair(selected_result, rejected_result); + CUB_CDP_DISPATCH( + (ret = __partition::partition(policy, + first, + last, + __partition::no_stencil_tag(), + selected_result, + rejected_result, + predicate);), + (ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + selected_result, + rejected_result, + predicate);)); return ret; } @@ -966,29 +949,22 @@ stable_partition_copy(execution_policy &policy, RejectedOutIt rejected_result, Predicate predicate) { - pair ret = thrust::make_pair(selected_result, rejected_result); - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition(policy, - first, - last, - stencil, - selected_result, - rejected_result, - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - selected_result, - rejected_result, - predicate); -#endif - } + auto ret = thrust::make_pair(selected_result, rejected_result); + CUB_CDP_DISPATCH( + (ret = __partition::partition(policy, + first, + last, + stencil, + selected_result, + rejected_result, + predicate);), + (ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + stencil, + selected_result, + rejected_result, + predicate);)); return ret; } @@ -1006,22 +982,17 @@ partition(execution_policy &policy, StencilIt stencil, Predicate predicate) { - Iterator ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition_inplace(policy, first, last, stencil, predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::partition(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - predicate); -#endif - } - return ret; + CUB_CDP_DISPATCH((last = __partition::partition_inplace(policy, + first, + last, + stencil, + predicate);), + (last = thrust::partition(cvt_to_seq(derived_cast(policy)), + first, + last, + stencil, + predicate);)); + return last; } __thrust_exec_check_disable__ @@ -1034,25 +1005,17 @@ partition(execution_policy &policy, Iterator last, Predicate predicate) { - Iterator ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = __partition::partition_inplace(policy, - first, - last, - __partition::no_stencil_tag(), - predicate); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::partition(cvt_to_seq(derived_cast(policy)), - first, - last, - predicate); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (last = __partition::partition_inplace(policy, + first, + last, + __partition::no_stencil_tag(), + predicate);), + (last = thrust::partition(cvt_to_seq(derived_cast(policy)), + first, + last, + predicate);)); + return last; } __thrust_exec_check_disable__ @@ -1067,30 +1030,20 @@ stable_partition(execution_policy &policy, StencilIt stencil, Predicate predicate) { - Iterator result = first; - if (__THRUST_HAS_CUDART__) - { - result = __partition::partition_inplace(policy, + auto ret = last; + CUB_CDP_DISPATCH( + (ret = + __partition::partition_inplace(policy, first, last, stencil, predicate); + + /* partition returns rejected values in reverse order + so reverse the rejected elements to make it stable */ + cuda_cub::reverse(policy, ret, last);), + (ret = thrust::stable_partition(cvt_to_seq(derived_cast(policy)), first, last, stencil, - predicate); - - // partition returns rejected values in reverese order - // so reverse the rejected elements to make it stable - cuda_cub::reverse(policy, result, last); - } - else - { -#if !__THRUST_HAS_CUDART__ - result = thrust::stable_partition(cvt_to_seq(derived_cast(policy)), - first, - last, - stencil, - predicate); -#endif - } - return result; + predicate);)); + return ret; } __thrust_exec_check_disable__ @@ -1103,29 +1056,22 @@ stable_partition(execution_policy &policy, Iterator last, Predicate predicate) { - Iterator result = first; - if (__THRUST_HAS_CUDART__) - { - result = __partition::partition_inplace(policy, - first, - last, - __partition::no_stencil_tag(), - predicate); - - // partition returns rejected values in reverese order - // so reverse the rejected elements to make it stable - cuda_cub::reverse(policy, result, last); - } - else - { -#if !__THRUST_HAS_CUDART__ - result = thrust::stable_partition(cvt_to_seq(derived_cast(policy)), - first, - last, - predicate); -#endif - } - return result; + auto ret = last; + CUB_CDP_DISPATCH( + (ret = __partition::partition_inplace(policy, + first, + last, + __partition::no_stencil_tag(), + predicate); + + /* partition returns rejected values in reverse order + so reverse the rejected elements to make it stable */ + cuda_cub::reverse(policy, ret, last);), + (ret = thrust::stable_partition(cvt_to_seq(derived_cast(policy)), + first, + last, + predicate);)); + return ret; } template #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include #include -#include -#include +#include #include +#include #include -#include -#include -#include -#include -#include +#include #include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -196,8 +197,11 @@ namespace __reduce { { cub::GridMappingStrategy grid_mapping; + CUB_RUNTIME_FUNCTION + Plan() {} + template - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION Plan(P) : core::AgentPlan(P()), grid_mapping(P::GRID_MAPPING) { @@ -691,7 +695,7 @@ namespace __reduce { class Size, class ReductionOp, class T> - cudaError_t THRUST_RUNTIME_FUNCTION + cudaError_t CUB_RUNTIME_FUNCTION doit_step(void * d_temp_storage, size_t & temp_storage_bytes, InputIt input_it, @@ -853,7 +857,7 @@ namespace __reduce { typename Size, typename T, typename BinaryOp> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION T reduce(execution_policy& policy, InputIt first, Size num_items, @@ -929,7 +933,7 @@ template -THRUST_RUNTIME_FUNCTION +CUB_RUNTIME_FUNCTION T reduce_n_impl(execution_policy& policy, InputIt first, Size num_items, @@ -1018,14 +1022,17 @@ T reduce_n(execution_policy& policy, T init, BinaryOp binary_op) { - if (__THRUST_HAS_CUDART__) - return thrust::cuda_cub::detail::reduce_n_impl( - policy, first, num_items, init, binary_op); - - #if !__THRUST_HAS_CUDART__ - return thrust::reduce( - cvt_to_seq(derived_cast(policy)), first, first + num_items, init, binary_op); - #endif + CUB_CDP_DISPATCH((init = thrust::cuda_cub::detail::reduce_n_impl(policy, + first, + num_items, + init, + binary_op);), + (init = thrust::reduce(cvt_to_seq(derived_cast(policy)), + first, + first + num_items, + init, + binary_op);)); + return init; } template diff --git a/thrust/system/cuda/detail/reduce_by_key.h b/thrust/system/cuda/detail/reduce_by_key.h index ba66f6d889..f19def66ef 100644 --- a/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/system/cuda/detail/reduce_by_key.h @@ -29,25 +29,26 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include +#include #include -#include -#include +#include +#include #include +#include #include -#include -#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -867,7 +868,7 @@ namespace __reduce_by_key { class EqualityOp, class ReductionOp, class Size> - THRUST_RUNTIME_FUNCTION cudaError_t + CUB_RUNTIME_FUNCTION cudaError_t doit_step(void * d_temp_storage, size_t & temp_storage_bytes, KeysInputIt keys_input_it, @@ -969,7 +970,7 @@ namespace __reduce_by_key { typename ValuesOutputIt, typename EqualityOp, typename ReductionOp> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION pair reduce_by_key(execution_policy& policy, KeysInputIt keys_first, @@ -1078,31 +1079,24 @@ reduce_by_key(execution_policy &policy, BinaryPred binary_pred, BinaryOp binary_op) { - pair ret = thrust::make_pair(keys_output, values_output); - if (__THRUST_HAS_CUDART__) - { - ret = __reduce_by_key::reduce_by_key(policy, - keys_first, - keys_last, - values_first, - keys_output, - values_output, - binary_pred, - binary_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::reduce_by_key(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - keys_output, - values_output, - binary_pred, - binary_op); -#endif - } + auto ret = thrust::make_pair(keys_output, values_output); + CUB_CDP_DISPATCH((ret = __reduce_by_key::reduce_by_key(policy, + keys_first, + keys_last, + values_first, + keys_output, + values_output, + binary_pred, + binary_op);), + (ret = + thrust::reduce_by_key(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values_first, + keys_output, + values_output, + binary_pred, + binary_op);)); return ret; } diff --git a/thrust/system/cuda/detail/scan.h b/thrust/system/cuda/detail/scan.h index 0011c0f354..c0aa58611c 100644 --- a/thrust/system/cuda/detail/scan.h +++ b/thrust/system/cuda/detail/scan.h @@ -38,6 +38,7 @@ #include #include +#include #include THRUST_NAMESPACE_BEGIN @@ -220,26 +221,18 @@ OutputIt inclusive_scan_n(thrust::cuda_cub::execution_policy &policy, OutputIt result, ScanOp scan_op) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = thrust::cuda_cub::detail::inclusive_scan_n_impl(policy, - first, - num_items, - result, - scan_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)), - first, - first + num_items, - result, - scan_op); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (result = thrust::cuda_cub::detail::inclusive_scan_n_impl(policy, + first, + num_items, + result, + scan_op);), + (result = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)), + first, + first + num_items, + result, + scan_op);)); + return result; } template @@ -288,28 +281,20 @@ OutputIt exclusive_scan_n(thrust::cuda_cub::execution_policy &policy, T init, ScanOp scan_op) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = thrust::cuda_cub::detail::exclusive_scan_n_impl(policy, - first, - num_items, - result, - init, - scan_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::exclusive_scan(cvt_to_seq(derived_cast(policy)), - first, - first + num_items, - result, - init, - scan_op); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (result = thrust::cuda_cub::detail::exclusive_scan_n_impl(policy, + first, + num_items, + result, + init, + scan_op);), + (result = thrust::exclusive_scan(cvt_to_seq(derived_cast(policy)), + first, + first + num_items, + result, + init, + scan_op);)); + return result; } template #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC + #include +#include +#include #include +#include +#include +#include #include - #include -#include -#include -#include -#include -#include + +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -607,7 +610,7 @@ namespace __scan_by_key { T init; ScanOp scan_op; - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION AddInitToScan(T init_, ScanOp scan_op_) : init(init_), scan_op(scan_op_) {} @@ -632,7 +635,7 @@ namespace __scan_by_key { class ScanOp, class Size, class AddInitToScan> - THRUST_RUNTIME_FUNCTION cudaError_t + CUB_RUNTIME_FUNCTION cudaError_t doit_step(void * d_temp_storage, size_t & temp_storage_bytes, KeysInputIt keys_input_it, @@ -725,7 +728,7 @@ namespace __scan_by_key { typename EqualityOp, typename ScanOp, typename AddInitToScan> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION ValuesOutputIt scan_by_key(execution_policy& policy, KeysInputIt keys_first, KeysInputIt keys_last, @@ -802,36 +805,29 @@ inclusive_scan_by_key(execution_policy &policy, KeyInputIt key_first, KeyInputIt key_last, ValInputIt value_first, - ValOutputIt value_result, + ValOutputIt result, BinaryPred binary_pred, ScanOp scan_op) { - ValOutputIt ret = value_result; - if (__THRUST_HAS_CUDART__) - { - typedef typename iterator_traits::value_type T; - ret = __scan_by_key::scan_by_key(policy, - key_first, - key_last, - value_first, - value_result, - binary_pred, - scan_op, - __scan_by_key::DoNothing()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::inclusive_scan_by_key(cvt_to_seq(derived_cast(policy)), - key_first, - key_last, - value_first, - value_result, - binary_pred, - scan_op); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (using no_op_t = __scan_by_key::DoNothing>; + result = + __scan_by_key::scan_by_key(policy, + key_first, + key_last, + value_first, + result, + binary_pred, + scan_op, + no_op_t{});), + (result = thrust::inclusive_scan_by_key(cvt_to_seq(derived_cast(policy)), + key_first, + key_last, + value_first, + result, + binary_pred, + scan_op);)); + return result; } template &policy, KeyInputIt key_first, KeyInputIt key_last, ValInputIt value_first, - ValOutputIt value_result, + ValOutputIt result, Init init, BinaryPred binary_pred, ScanOp scan_op) { - ValOutputIt ret = value_result; - if (__THRUST_HAS_CUDART__) - { - ret = __scan_by_key::scan_by_key( - policy, - key_first, - key_last, - value_first, - value_result, - binary_pred, - scan_op, - __scan_by_key::AddInitToScan(init, scan_op)); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::exclusive_scan_by_key(cvt_to_seq(derived_cast(policy)), - key_first, - key_last, - value_first, - value_result, - init, - binary_pred, - scan_op); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (result = __scan_by_key::scan_by_key( + policy, + key_first, + key_last, + value_first, + result, + binary_pred, + scan_op, + __scan_by_key::AddInitToScan(init, scan_op));), + (result = thrust::exclusive_scan_by_key(cvt_to_seq(derived_cast(policy)), + key_first, + key_last, + value_first, + result, + init, + binary_pred, + scan_op);)); + return result; } template #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include #include +#include #include -#include -#include -#include -#include +#include #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include + +#include THRUST_NAMESPACE_BEGIN @@ -1116,7 +1118,7 @@ namespace __set_operations { class ValuesOutputIt, class CompareOp, class SetOp> - cudaError_t THRUST_RUNTIME_FUNCTION + cudaError_t CUB_RUNTIME_FUNCTION doit_step(void * d_temp_storage, size_t & temp_storage_size, KeysIt1 keys1, @@ -1247,7 +1249,7 @@ namespace __set_operations { typename ValuesOutputIt, typename CompareOp, typename SetOp> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION pair set_operations(execution_policy& policy, KeysIt1 keys1_first, @@ -1363,38 +1365,30 @@ set_difference(execution_policy &policy, OutputIt result, CompareOp compare) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typename thrust::iterator_value::type *null_ = NULL; - // - ret = __set_operations::set_operations( - policy, - items1_first, - items1_last, - items2_first, - items2_last, - null_, - null_, - result, - null_, - compare, - __set_operations::serial_set_difference()) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_difference(cvt_to_seq(derived_cast(policy)), - items1_first, - items1_last, - items2_first, - items2_last, - result, - compare); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (using items1_t = thrust::iterator_value_t; + items1_t *null_ = nullptr; + auto tmp = __set_operations::set_operations( + policy, + items1_first, + items1_last, + items2_first, + items2_last, + null_, + null_, + result, + null_, + compare, + __set_operations::serial_set_difference()); + result = tmp.first;), + (result = thrust::set_difference(cvt_to_seq(derived_cast(policy)), + items1_first, + items1_last, + items2_first, + items2_last, + result, + compare);)); + return result; } template &policy, OutputIt result, CompareOp compare) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typename thrust::iterator_value::type *null_ = NULL; - // - ret = __set_operations::set_operations( - policy, - items1_first, - items1_last, - items2_first, - items2_last, - null_, - null_, - result, - null_, - compare, - __set_operations::serial_set_intersection()) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_intersection(cvt_to_seq(derived_cast(policy)), - items1_first, - items1_last, - items2_first, - items2_last, - result, - compare); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (using items1_t = thrust::iterator_value_t; + items1_t *null_ = NULL; + auto tmp = __set_operations::set_operations( + policy, + items1_first, + items1_last, + items2_first, + items2_last, + null_, + null_, + result, + null_, + compare, + __set_operations::serial_set_intersection()); + result = tmp.first;), + (result = thrust::set_intersection(cvt_to_seq(derived_cast(policy)), + items1_first, + items1_last, + items2_first, + items2_last, + result, + compare);)); + return result; } template &policy, OutputIt result, CompareOp compare) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typename thrust::iterator_value::type *null_ = NULL; - // - ret = __set_operations::set_operations( - policy, - items1_first, - items1_last, - items2_first, - items2_last, - null_, - null_, - result, - null_, - compare, - __set_operations::serial_set_symmetric_difference()) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_symmetric_difference(cvt_to_seq(derived_cast(policy)), - items1_first, - items1_last, - items2_first, - items2_last, - result, - compare); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (using items1_t = thrust::iterator_value_t; + items1_t *null_ = nullptr; + auto tmp = __set_operations::set_operations( + policy, + items1_first, + items1_last, + items2_first, + items2_last, + null_, + null_, + result, + null_, + compare, + __set_operations::serial_set_symmetric_difference()); + result = tmp.first;), + (result = thrust::set_symmetric_difference(cvt_to_seq(derived_cast(policy)), + items1_first, + items1_last, + items2_first, + items2_last, + result, + compare);)); + return result; } - template &policy, OutputIt result, CompareOp compare) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - typename thrust::iterator_value::type *null_ = NULL; - // - ret = __set_operations::set_operations( - policy, - items1_first, - items1_last, - items2_first, - items2_last, - null_, - null_, - result, - null_, - compare, - __set_operations::serial_set_union()) - .first; - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_union(cvt_to_seq(derived_cast(policy)), - items1_first, - items1_last, - items2_first, - items2_last, - result, - compare); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (using items1_t = thrust::iterator_value_t; + items1_t *null_ = nullptr; + auto tmp = __set_operations::set_operations( + policy, + items1_first, + items1_last, + items2_first, + items2_last, + null_, + null_, + result, + null_, + compare, + __set_operations::serial_set_union()); + result = tmp.first;), + (result = thrust::set_union(cvt_to_seq(derived_cast(policy)), + items1_first, + items1_last, + items2_first, + items2_last, + result, + compare);)); + return result; } - template &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - ret = __set_operations::set_operations( - policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op, - __set_operations::serial_set_difference()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_difference_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op); -#endif - } + auto ret = thrust::make_pair(keys_result, items_result); + CUB_CDP_DISPATCH( + (ret = __set_operations::set_operations( + policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op, + __set_operations::serial_set_difference());), + (ret = thrust::set_difference_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op);)); return ret; } @@ -1759,36 +1720,29 @@ set_intersection_by_key(execution_policy &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - ret = __set_operations::set_operations( - policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items1_first, - keys_result, - items_result, - compare_op, - __set_operations::serial_set_intersection()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_intersection_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - keys_result, - items_result, - compare_op); -#endif - } + auto ret = thrust::make_pair(keys_result, items_result); + CUB_CDP_DISPATCH( + (ret = __set_operations::set_operations( + policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items1_first, + keys_result, + items_result, + compare_op, + __set_operations::serial_set_intersection());), + (ret = thrust::set_intersection_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + keys_result, + items_result, + compare_op);)); return ret; } @@ -1844,37 +1798,31 @@ set_symmetric_difference_by_key(execution_policy &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - ret = __set_operations::set_operations( - policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op, - __set_operations::serial_set_symmetric_difference()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_symmetric_difference_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op); -#endif - } + auto ret = thrust::make_pair(keys_result, items_result); + CUB_CDP_DISPATCH( + (ret = __set_operations::set_operations( + policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op, + __set_operations::serial_set_symmetric_difference());), + (ret = + thrust::set_symmetric_difference_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op);)); return ret; } @@ -1932,37 +1880,30 @@ set_union_by_key(execution_policy &policy, ItemsOutputIt items_result, CompareOp compare_op) { - pair ret = thrust::make_pair(keys_result, items_result); - if (__THRUST_HAS_CUDART__) - { - ret = __set_operations::set_operations( - policy, - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op, - __set_operations::serial_set_union()); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::set_union_by_key(cvt_to_seq(derived_cast(policy)), - keys1_first, - keys1_last, - keys2_first, - keys2_last, - items1_first, - items2_first, - keys_result, - items_result, - compare_op); -#endif - } + auto ret = thrust::make_pair(keys_result, items_result); + CUB_CDP_DISPATCH( + (ret = __set_operations::set_operations( + policy, + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op, + __set_operations::serial_set_union());), + (ret = thrust::set_union_by_key(cvt_to_seq(derived_cast(policy)), + keys1_first, + keys1_last, + keys2_first, + keys2_last, + items1_first, + items2_first, + keys_result, + items_result, + compare_op);)); return ret; } diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h index 4babc3383b..d7d9627e2e 100644 --- a/thrust/system/cuda/detail/sort.h +++ b/thrust/system/cuda/detail/sort.h @@ -29,26 +29,30 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include +#include +#include +#include +#include + #include #include #include -#include -#include - #include #include -#include -#include -#include -#include -#include -#include +#include + #include +#include +#include +#include +#include + #include +#include +#include + +#include THRUST_NAMESPACE_BEGIN namespace cuda_cub { @@ -59,7 +63,7 @@ namespace __merge_sort { class ItemsIt, class Size, class CompareOp> - THRUST_RUNTIME_FUNCTION cudaError_t + CUB_RUNTIME_FUNCTION cudaError_t doit_step(void* d_temp_storage, size_t& temp_storage_bytes, KeysIt keys, @@ -97,7 +101,7 @@ namespace __merge_sort { class ItemsIt, class Size, class CompareOp> - THRUST_RUNTIME_FUNCTION cudaError_t + CUB_RUNTIME_FUNCTION cudaError_t doit_step(void *d_temp_storage, size_t &temp_storage_bytes, KeysIt keys, @@ -129,7 +133,7 @@ namespace __merge_sort { class ItemsIt, class Size, class CompareOp> - THRUST_RUNTIME_FUNCTION cudaError_t + CUB_RUNTIME_FUNCTION cudaError_t doit_step(void *d_temp_storage, size_t &temp_storage_bytes, KeysIt keys, @@ -163,7 +167,7 @@ namespace __merge_sort { typename KeysIt, typename ItemsIt, typename CompareOp> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION void merge_sort(execution_policy& policy, KeysIt keys_first, KeysIt keys_last, @@ -220,7 +224,7 @@ namespace __radix_sort { struct dispatch > { template - THRUST_RUNTIME_FUNCTION static cudaError_t + CUB_RUNTIME_FUNCTION static cudaError_t doit(void* d_temp_storage, size_t& temp_storage_bytes, cub::DoubleBuffer& keys_buffer, @@ -245,7 +249,7 @@ namespace __radix_sort { struct dispatch > { template - THRUST_RUNTIME_FUNCTION static cudaError_t + CUB_RUNTIME_FUNCTION static cudaError_t doit(void* d_temp_storage, size_t& temp_storage_bytes, cub::DoubleBuffer& keys_buffer, @@ -270,7 +274,7 @@ namespace __radix_sort { struct dispatch > { template - THRUST_RUNTIME_FUNCTION static cudaError_t + CUB_RUNTIME_FUNCTION static cudaError_t doit(void* d_temp_storage, size_t& temp_storage_bytes, cub::DoubleBuffer& keys_buffer, @@ -296,7 +300,7 @@ namespace __radix_sort { struct dispatch > { template - THRUST_RUNTIME_FUNCTION static cudaError_t + CUB_RUNTIME_FUNCTION static cudaError_t doit(void* d_temp_storage, size_t& temp_storage_bytes, cub::DoubleBuffer& keys_buffer, @@ -323,7 +327,7 @@ namespace __radix_sort { typename Item, typename Size, typename CompareOp> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION void radix_sort(execution_policy& policy, Key* keys, Item* items, @@ -430,7 +434,7 @@ namespace __smart_sort { class KeysIt, class ItemsIt, class CompareOp> - THRUST_RUNTIME_FUNCTION typename enable_if_comparison_sort::type + CUB_RUNTIME_FUNCTION typename enable_if_comparison_sort::type smart_sort(Policy& policy, KeysIt keys_first, KeysIt keys_last, @@ -451,7 +455,7 @@ namespace __smart_sort { class KeysIt, class ItemsIt, class CompareOp> - THRUST_RUNTIME_FUNCTION typename enable_if_primitive_sort::type + CUB_RUNTIME_FUNCTION typename enable_if_primitive_sort::type smart_sort(execution_policy& policy, KeysIt keys_first, KeysIt keys_last, @@ -515,18 +519,15 @@ sort(execution_policy& policy, ItemsIt last, CompareOp compare_op) { - if (__THRUST_HAS_CUDART__) - { - typedef typename thrust::iterator_value::type item_type; - __smart_sort::smart_sort( - policy, first, last, (item_type*)NULL, compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - thrust::sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op); -#endif - } + CUB_CDP_DISPATCH( + (using item_t = thrust::iterator_value_t; item_t *null_ = nullptr; + __smart_sort::smart_sort(policy, + first, + last, + null_, + compare_op);), + (thrust::sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op);)); } __thrust_exec_check_disable__ @@ -537,18 +538,18 @@ stable_sort(execution_policy& policy, ItemsIt last, CompareOp compare_op) { - if (__THRUST_HAS_CUDART__) - { - typedef typename thrust::iterator_value::type item_type; - __smart_sort::smart_sort( - policy, first, last, (item_type*)NULL, compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - thrust::stable_sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op); -#endif - } + CUB_CDP_DISPATCH( + (using item_t = thrust::iterator_value_t; item_t *null_ = nullptr; + __smart_sort::smart_sort(policy, + first, + last, + null_, + compare_op);), + (thrust::stable_sort(cvt_to_seq(derived_cast(policy)), + first, + last, + compare_op);)); } __thrust_exec_check_disable__ @@ -560,18 +561,18 @@ sort_by_key(execution_policy& policy, ValuesIt values, CompareOp compare_op) { - if (__THRUST_HAS_CUDART__) - { - __smart_sort::smart_sort( - policy, keys_first, keys_last, values, compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - thrust::sort_by_key( - cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values, compare_op); -#endif - } + CUB_CDP_DISPATCH( + (__smart_sort::smart_sort(policy, + keys_first, + keys_last, + values, + compare_op);), + (thrust::sort_by_key(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values, + compare_op);)); } __thrust_exec_check_disable__ @@ -586,18 +587,18 @@ stable_sort_by_key(execution_policy &policy, ValuesIt values, CompareOp compare_op) { - if (__THRUST_HAS_CUDART__) - { - __smart_sort::smart_sort( - policy, keys_first, keys_last, values, compare_op); - } - else - { -#if !__THRUST_HAS_CUDART__ - thrust::stable_sort_by_key( - cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values, compare_op); -#endif - } + CUB_CDP_DISPATCH( + (__smart_sort::smart_sort(policy, + keys_first, + keys_last, + values, + compare_op);), + (thrust::stable_sort_by_key(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values, + compare_op);)); } // API with default comparator diff --git a/thrust/system/cuda/detail/unique.h b/thrust/system/cuda/detail/unique.h index 91dd2b84fb..67c5267b1b 100644 --- a/thrust/system/cuda/detail/unique.h +++ b/thrust/system/cuda/detail/unique.h @@ -29,20 +29,21 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include -#include -#include -#include #include -#include -#include -#include -#include -#include #include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -545,7 +546,7 @@ namespace __unique { class BinaryPred, class Size, class NumSelectedOutIt> - static cudaError_t THRUST_RUNTIME_FUNCTION + static cudaError_t CUB_RUNTIME_FUNCTION doit_step(void * d_temp_storage, size_t & temp_storage_bytes, ItemsInputIt items_in, @@ -632,7 +633,7 @@ namespace __unique { typename ItemsInputIt, typename ItemsOutputIt, typename BinaryPred> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION ItemsOutputIt unique(execution_policy& policy, ItemsInputIt items_first, ItemsInputIt items_last, @@ -719,26 +720,14 @@ unique_copy(execution_policy &policy, OutputIt result, BinaryPred binary_pred) { - OutputIt ret = result; - if (__THRUST_HAS_CUDART__) - { - ret = __unique::unique(policy, - first, - last, - result, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::unique_copy(cvt_to_seq(derived_cast(policy)), - first, - last, - result, - binary_pred); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (result = __unique::unique(policy, first, last, result, binary_pred);), + (result = thrust::unique_copy(cvt_to_seq(derived_cast(policy)), + first, + last, + result, + binary_pred);)); + return result; } template &policy, InputIt last, BinaryPred binary_pred) { - InputIt ret = first; - if (__THRUST_HAS_CUDART__) - { - ret = cuda_cub::unique_copy(policy, first, last, first, binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::unique(cvt_to_seq(derived_cast(policy)), - first, - last, - binary_pred); -#endif - } - return ret; + CUB_CDP_DISPATCH( + (last = cuda_cub::unique_copy(policy, first, last, first, binary_pred);), + (last = thrust::unique(cvt_to_seq(derived_cast(policy)), + first, + last, + binary_pred);)); + return last; } template #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include #include #include -#include -#include +#include +#include +#include +#include +#include +#include #include #include #include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -623,7 +624,7 @@ namespace __unique_by_key { class BinaryPred, class Size, class NumSelectedOutIt> - static cudaError_t THRUST_RUNTIME_FUNCTION + static cudaError_t CUB_RUNTIME_FUNCTION doit_step(void * d_temp_storage, size_t & temp_storage_bytes, KeyInputIt keys_in, @@ -718,7 +719,7 @@ namespace __unique_by_key { typename KeyOutputIt, typename ValOutputIt, typename BinaryPred> - THRUST_RUNTIME_FUNCTION + CUB_RUNTIME_FUNCTION pair unique_by_key(execution_policy& policy, KeyInputIt keys_first, @@ -824,29 +825,22 @@ unique_by_key_copy(execution_policy &policy, ValOutputIt values_result, BinaryPred binary_pred) { - pair ret = thrust::make_pair(keys_result, values_result); - if (__THRUST_HAS_CUDART__) - { - ret = __unique_by_key::unique_by_key(policy, - keys_first, - keys_last, - values_first, - keys_result, - values_result, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - keys_result, - values_result, - binary_pred); -#endif - } + auto ret = thrust::make_pair(keys_result, values_result); + CUB_CDP_DISPATCH( + (ret = __unique_by_key::unique_by_key(policy, + keys_first, + keys_last, + values_first, + keys_result, + values_result, + binary_pred);), + (ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values_first, + keys_result, + values_result, + binary_pred);)); return ret; } @@ -884,27 +878,20 @@ unique_by_key(execution_policy &policy, ValInputIt values_first, BinaryPred binary_pred) { - pair ret = thrust::make_pair(keys_first, values_first); - if (__THRUST_HAS_CUDART__) - { - ret = cuda_cub::unique_by_key_copy(policy, - keys_first, - keys_last, - values_first, - keys_first, - values_first, - binary_pred); - } - else - { -#if !__THRUST_HAS_CUDART__ - ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)), - keys_first, - keys_last, - values_first, - binary_pred); -#endif - } + auto ret = thrust::make_pair(keys_first, values_first); + CUB_CDP_DISPATCH( + (ret = cuda_cub::unique_by_key_copy(policy, + keys_first, + keys_last, + values_first, + keys_first, + values_first, + binary_pred);), + (ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)), + keys_first, + keys_last, + values_first, + binary_pred);)); return ret; } diff --git a/thrust/system/cuda/detail/util.h b/thrust/system/cuda/detail/util.h index 602b77ebcc..366221615f 100644 --- a/thrust/system/cuda/detail/util.h +++ b/thrust/system/cuda/detail/util.h @@ -99,20 +99,20 @@ synchronize_stream(execution_policy &policy) // Can't use #if inside NV_IF_TARGET, use a temp macro to hoist the device // instructions out of the target logic. -#if __THRUST_HAS_CUDART__ +#ifdef CUB_RUNTIME_ENABLED #define THRUST_TEMP_DEVICE_CODE \ THRUST_UNUSED_VAR(policy); \ cub::detail::device_synchronize(); \ result = cudaGetLastError() -#else +#else // CUB_RUNTIME_ENABLED #define THRUST_TEMP_DEVICE_CODE \ THRUST_UNUSED_VAR(policy); \ result = cudaSuccess -#endif +#endif // CUB_RUNTIME_ENABLED NV_IF_TARGET(NV_IS_HOST, ( cudaStreamSynchronize(stream(policy)); @@ -255,10 +255,12 @@ terminate() __host__ __device__ inline void throw_on_error(cudaError_t status) { -#if __THRUST_HAS_CUDART__ // Clear the global CUDA error state which may have been set by the last // call. Otherwise, errors may "leak" to unrelated kernel launches. +#ifdef CUB_RUNTIME_ENABLED cudaGetLastError(); +#else + NV_IF_TARGET(NV_IS_HOST, (cudaGetLastError();), ()); #endif if (cudaSuccess != status) @@ -266,7 +268,7 @@ inline void throw_on_error(cudaError_t status) // Can't use #if inside NV_IF_TARGET, use a temp macro to hoist the device // instructions out of the target logic. -#if __THRUST_HAS_CUDART__ +#ifdef CUB_RUNTIME_ENABLED #define THRUST_TEMP_DEVICE_CODE \ printf("Thrust CUDA backend error: %s: %s\n", \ @@ -296,17 +298,19 @@ inline void throw_on_error(cudaError_t status) __host__ __device__ inline void throw_on_error(cudaError_t status, char const *msg) { -#if __THRUST_HAS_CUDART__ // Clear the global CUDA error state which may have been set by the last // call. Otherwise, errors may "leak" to unrelated kernel launches. +#ifdef CUB_RUNTIME_ENABLED cudaGetLastError(); +#else + NV_IF_TARGET(NV_IS_HOST, (cudaGetLastError();), ()); #endif if (cudaSuccess != status) { // Can't use #if inside NV_IF_TARGET, use a temp macro to hoist the device // instructions out of the target logic. -#if __THRUST_HAS_CUDART__ +#ifdef CUB_RUNTIME_ENABLED #define THRUST_TEMP_DEVICE_CODE \ printf("Thrust CUDA backend error: %s: %s: %s\n", \