Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Use CUB's new CDP macros.
Browse files Browse the repository at this point in the history
  • Loading branch information
alliepiper committed May 17, 2022
1 parent 71fab9e commit 3e07bf5
Show file tree
Hide file tree
Showing 25 changed files with 1,025 additions and 2,042 deletions.
2 changes: 1 addition & 1 deletion dependencies/cub
Submodule cub updated 49 files
+74 −0 cub/detail/detect_cuda_runtime.cuh
+2 −8 cub/detail/device_synchronize.cuh
+2 −2 cub/detail/type_traits.cuh
+5 −5 cub/device/device_adjacent_difference.cuh
+8 −8 cub/device/device_histogram.cuh
+6 −6 cub/device/device_merge_sort.cuh
+9 −9 cub/device/device_partition.cuh
+8 −8 cub/device/device_radix_sort.cuh
+8 −8 cub/device/device_reduce.cuh
+2 −2 cub/device/device_run_length_encode.cuh
+15 −15 cub/device/device_scan.cuh
+8 −8 cub/device/device_segmented_radix_sort.cuh
+7 −7 cub/device/device_segmented_reduce.cuh
+16 −16 cub/device/device_segmented_sort.cuh
+6 −6 cub/device/device_select.cuh
+1 −1 cub/device/device_spmv.cuh
+3 −3 cub/device/dispatch/dispatch_adjacent_difference.cuh
+10 −19 cub/device/dispatch/dispatch_histogram.cuh
+9 −9 cub/device/dispatch/dispatch_merge_sort.cuh
+16 −46 cub/device/dispatch/dispatch_radix_sort.cuh
+9 −36 cub/device/dispatch/dispatch_reduce.cuh
+4 −29 cub/device/dispatch/dispatch_reduce_by_key.cuh
+5 −15 cub/device/dispatch/dispatch_rle.cuh
+4 −16 cub/device/dispatch/dispatch_scan.cuh
+4 −15 cub/device/dispatch/dispatch_scan_by_key.cuh
+52 −47 cub/device/dispatch/dispatch_segmented_sort.cuh
+4 −28 cub/device/dispatch/dispatch_select_if.cuh
+4 −12 cub/device/dispatch/dispatch_spmv_orig.cuh
+4 −4 cub/device/dispatch/dispatch_three_way_partition.cuh
+9 −18 cub/device/dispatch/dispatch_unique_by_key.cuh
+26 −12 cub/util_arch.cuh
+3 −5 cub/util_debug.cuh
+14 −57 cub/util_device.cuh
+51 −13 test/CMakeLists.txt
+21 −9 test/README.md
+0 −39 test/test_allocator.cu
+34 −0 test/test_cdp_variant_state.cu
+4 −75 test/test_device_histogram.cu
+350 −156 test/test_device_radix_sort.cu
+305 −172 test/test_device_reduce.cu
+126 −84 test/test_device_reduce_by_key.cu
+129 −89 test/test_device_run_length_encode.cu
+111 −97 test/test_device_scan.cu
+128 −110 test/test_device_scan_by_key.cu
+12 −8 test/test_device_segmented_sort.cu
+125 −81 test/test_device_select_if.cu
+97 −69 test/test_device_select_unique.cu
+111 −74 test/test_device_select_unique_by_key.cu
+85 −15 test/test_util.h
39 changes: 22 additions & 17 deletions thrust/system/cuda/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,29 +32,34 @@
// older releases. This header will always pull in version info:
#include <cub/util_namespace.cuh>

#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
# define __THRUST_HAS_CUDART__ 1
# define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
# else
# define __THRUST_HAS_CUDART__ 0
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
# endif
#else
# define __THRUST_HAS_CUDART__ 0
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
#endif

// These definitions were intended for internal use only and are now obsolete.
// If you relied on them, consider porting your code to use the functionality
// in libcu++'s <nv/target> header.
// in libcu++'s <nv/target> header (for `THRUST_DEVICE_CODE`) or the new CDP
// macros in CUB's <cub/detail/detect_cuda_runtime.cuh> header (for
// `__THRUST_HAS_CUDART__` and `THRUST_RUNTIME_FUNCTION`).
//
// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
// them available again. These should be considered deprecated and will be
// fully removed in a future version.
#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS
#ifdef __CUDA_ARCH__
#define THRUST_DEVICE_CODE
#endif // __CUDA_ARCH__

# if defined(__CUDACC__) || defined(_NVHPC_CUDA)
# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
# define __THRUST_HAS_CUDART__ 1
# define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
# else
# define __THRUST_HAS_CUDART__ 0
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
# endif
# else
# define __THRUST_HAS_CUDART__ 0
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
# endif

# ifdef __CUDA_ARCH__
# define THRUST_DEVICE_CODE
# endif // __CUDA_ARCH__

#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS

#ifdef THRUST_AGENT_ENTRY_NOINLINE
Expand Down
43 changes: 18 additions & 25 deletions thrust/system/cuda/detail/adjacent_difference.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@
#include <thrust/detail/config.h>

#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC

#include <thrust/detail/cstdint.h>
#include <thrust/detail/minmax.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/type_traits.h>
#include <thrust/functional.h>
#include <thrust/system/cuda/config.h>
#include <thrust/system/cuda/detail/cdp_dispatch.h>
#include <thrust/system/cuda/detail/dispatch.h>
#include <thrust/system/cuda/detail/par_to_seq.h>
#include <thrust/system/cuda/detail/util.h>
Expand Down Expand Up @@ -64,7 +66,7 @@ namespace __adjacent_difference {
class InputIt,
class OutputIt,
class BinaryOp>
cudaError_t THRUST_RUNTIME_FUNCTION
cudaError_t CUB_CDP_FUNCTION
doit_step(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIt first,
Expand Down Expand Up @@ -114,7 +116,7 @@ namespace __adjacent_difference {
template <class InputIt,
class OutputIt,
class BinaryOp>
cudaError_t THRUST_RUNTIME_FUNCTION
cudaError_t CUB_CDP_FUNCTION
doit_step(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIt first,
Expand All @@ -139,7 +141,7 @@ namespace __adjacent_difference {
template <class InputIt,
class OutputIt,
class BinaryOp>
cudaError_t THRUST_RUNTIME_FUNCTION
cudaError_t CUB_CDP_FUNCTION
doit_step(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIt first,
Expand Down Expand Up @@ -181,7 +183,7 @@ namespace __adjacent_difference {
typename InputIt,
typename OutputIt,
typename BinaryOp>
OutputIt THRUST_RUNTIME_FUNCTION
OutputIt CUB_CDP_FUNCTION
adjacent_difference(execution_policy<Derived>& policy,
InputIt first,
InputIt last,
Expand Down Expand Up @@ -260,27 +262,18 @@ adjacent_difference(execution_policy<Derived> &policy,
OutputIt result,
BinaryOp binary_op)
{
OutputIt ret = result;
if (__THRUST_HAS_CUDART__)
{
ret = __adjacent_difference::adjacent_difference(policy,
first,
last,
result,
binary_op);
}
else
{
#if !__THRUST_HAS_CUDART__
ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
first,
last,
result,
binary_op);
#endif
}

return ret;
THRUST_CDP_DISPATCH(
(result = __adjacent_difference::adjacent_difference(policy,
first,
last,
result,
binary_op);),
(result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
first,
last,
result,
binary_op);));
return result;
}

template <class Derived,
Expand Down
72 changes: 72 additions & 0 deletions thrust/system/cuda/detail/cdp_dispatch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* \file
* Utilities for CUDA dynamic parallelism.
*/

#pragma once

#include <cub/config.cuh>
#include <cub/detail/detect_cuda_runtime.cuh>

#include <nv/target>

/**
* \def THRUST_CDP_DISPATCH
*
* If CUDA Dynamic Parallelism / CUDA Nested Parallelism is available, always
* run the parallel implementation. Otherwise, run the parallel implementation
* when called from the host, and fallback to the sequential implementation on
* the device.
*
* `par_impl` and `seq_impl` are blocks of C++ statements enclosed in
* parentheses, similar to NV_IF_TARGET blocks:
*
* \code
* THRUST_CDP_DISPATCH((launch_parallel_kernel();), (run_serial_impl();));
* \endcode
*/

#ifdef CUB_CDP_ENABLED

// seq_impl unused.
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
NV_IF_TARGET(NV_ANY_TARGET, par_impl)

#else // CUB_CDP_ENABLED

// Special case for NVCC -- need to inform the device path about the kernels
// that are launched from the host path.
#if defined(__CUDACC__) && defined(__CUDA_ARCH__)

// Device-side launch not supported, fallback to sequential in device code.
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
if (false) \
{ /* Without this, the device pass won't compile any kernels. */ \
NV_IF_TARGET(NV_ANY_TARGET, par_impl); \
} \
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)

#else // NVCC device pass

#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)

#endif // NVCC device pass

#endif // CUB_CDP_ENABLED
45 changes: 16 additions & 29 deletions thrust/system/cuda/detail/copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@

#include <thrust/detail/config.h>

#include <thrust/advance.h>

#include <thrust/system/cuda/config.h>
#include <thrust/system/cuda/detail/cdp_dispatch.h>
#include <thrust/system/cuda/detail/execution_policy.h>
#include <thrust/system/cuda/detail/cross_system.h>

Expand Down Expand Up @@ -117,22 +120,11 @@ copy(execution_policy<System> &system,
InputIterator last,
OutputIterator result)
{
OutputIterator ret = result;
if (__THRUST_HAS_CUDART__)
{
ret = __copy::device_to_device(system, first, last, result);
}
else
{
#if !__THRUST_HAS_CUDART__
ret = thrust::copy(cvt_to_seq(derived_cast(system)),
first,
last,
result);
#endif
}

return ret;
THRUST_CDP_DISPATCH(
(result = __copy::device_to_device(system, first, last, result);),
(result =
thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);));
return result;
} // end copy()

__thrust_exec_check_disable__
Expand All @@ -146,19 +138,14 @@ copy_n(execution_policy<System> &system,
Size n,
OutputIterator result)
{
OutputIterator ret = result;
if (__THRUST_HAS_CUDART__)
{
ret = __copy::device_to_device(system, first, first + n, result);
}
else
{
#if !__THRUST_HAS_CUDART__
ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);
#endif
}

return ret;
THRUST_CDP_DISPATCH(
(result = __copy::device_to_device(system,
first,
thrust::next(first, n),
result);),
(result =
thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);));
return result;
} // end copy_n()
#endif

Expand Down
Loading

0 comments on commit 3e07bf5

Please sign in to comment.