Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work around issues with cccl main #15552

Merged
merged 49 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
b74a236
Always use `cuda::proclaim_return_type` with device lambdas
miscco Apr 17, 2024
942291c
Work around issue with `thrust::pair` that prevents CTAD
miscco Apr 17, 2024
e172942
Apply suggestions from code review
bdice Apr 17, 2024
83bc096
Merge branch 'branch-24.06' into fix_cccl_compat
ttnghia Apr 19, 2024
32eab67
Merge branch 'fix_cccl_compat' of github.com:miscco/cudf into miscco-…
trxcllnt Apr 22, 2024
dec2915
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into miscco-f…
trxcllnt Apr 22, 2024
6ca0c2a
fix lint
trxcllnt Apr 22, 2024
9b09a9e
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into miscco-f…
trxcllnt May 8, 2024
c39db41
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 8, 2024
4bda146
Merge branch 'fix_cccl_compat' of github.com:miscco/cudf into miscco-…
trxcllnt May 8, 2024
04f1990
use std::min instead of cuda::std::min
trxcllnt May 8, 2024
586f502
use cuda::proclaim_return_type
trxcllnt May 8, 2024
3598c8f
remove test for int16_t key type that's unsupported by cuda::atomic_ref
trxcllnt May 8, 2024
69796d5
regenerate and add patches that apply to CCCL main
trxcllnt May 8, 2024
8c43425
don't modify whitespace in patches
trxcllnt May 8, 2024
991c789
don't run clang-format on files in cpp/build/*
trxcllnt May 8, 2024
60aecd8
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into fix/cccl…
trxcllnt May 9, 2024
9320968
update devcontainer workflow to use NVIDIA/cccl#pull-request/1667
trxcllnt May 9, 2024
69bf346
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 13, 2024
6a758bf
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 14, 2024
bb67523
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 14, 2024
4ff0c59
test rapids-cmake with CCCL 2.5
trxcllnt May 14, 2024
f122905
pass cuco::cuda_stream_ref
trxcllnt May 14, 2024
434600e
revert changes to pr.yaml
trxcllnt May 14, 2024
75a1606
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 15, 2024
9872c7c
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 15, 2024
d85c763
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 15, 2024
7c1abf8
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 16, 2024
b14899f
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 21, 2024
ffdab59
Update cpp/src/join/distinct_hash_join.cu
trxcllnt May 21, 2024
57d9eea
fix lint
trxcllnt May 21, 2024
7b0e75e
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 21, 2024
9b5bc7a
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 21, 2024
a011739
Apply suggestions from reviewers
trxcllnt May 21, 2024
78c1a89
revert more thrust::pair changes
trxcllnt May 21, 2024
68c5997
fix lint
trxcllnt May 22, 2024
a4e123a
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 22, 2024
114db08
Apply suggestions from code review
trxcllnt May 22, 2024
64264a7
fix lint
trxcllnt May 23, 2024
48f22e2
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into fix/cccl…
trxcllnt May 23, 2024
902f06b
include cuda/std/functional
trxcllnt May 23, 2024
3b29345
cuda::std::min -> std::min
trxcllnt May 23, 2024
264dda7
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 23, 2024
3fce393
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 23, 2024
d19d41d
Merge branch 'branch-24.06' into fix_cccl_compat
trxcllnt May 24, 2024
422538e
fix orc tests
trxcllnt May 24, 2024
774520c
Merge branch 'branch-24.06' of github.com:rapidsai/cudf into fix/cccl…
trxcllnt May 24, 2024
ef42695
compute and compare num_blocks instead of num_streams
trxcllnt May 24, 2024
d9a4947
revert changes to use rapids-cmake CCCL 2.5 branch
trxcllnt May 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ repos:
- id: trailing-whitespace
exclude: |
(?x)^(
^cpp/cmake/thirdparty/patches/.*|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
)
- id: end-of-file-fixer
exclude: |
(?x)^(
^cpp/cmake/thirdparty/patches/.*|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
)
- repo: https://github.com/PyCQA/isort
Expand Down
20 changes: 20 additions & 0 deletions cpp/cmake/thirdparty/patches/cccl_override.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
"issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff",
"issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
"fixed_in" : ""
},
{
"file": "cccl/kernel_pointer_hiding.diff",
"issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]",
Expand All @@ -28,15 +33,30 @@
"issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff",
"issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff",
"issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff",
"issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff",
"issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
"fixed_in" : ""
},
{
"file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff",
"issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
"fixed_in" : ""
}
]
}
Expand Down
47 changes: 47 additions & 0 deletions cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
index 046eb83c0..8047c9701 100644
--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
@@ -53,41 +53,15 @@ namespace cuda_cub

namespace __copy
{
-template <class Derived, class InputIt, class OutputIt>
-OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
- execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type)
-{
- typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
- const auto n = thrust::distance(first, last);
- if (n > 0)
- {
- cudaError status;
- status = trivial_copy_device_to_device(
- policy,
- reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
- reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*first)),
- n);
- cuda_cub::throw_on_error(status, "__copy:: D->D: failed");
- }
-
- return result + n;
-}

template <class Derived, class InputIt, class OutputIt>
OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
- execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type)
+ execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
{
typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
return cuda_cub::transform(policy, first, last, result, thrust::identity<InputTy>());
}

-template <class Derived, class InputIt, class OutputIt>
-OutputIt THRUST_RUNTIME_FUNCTION
-device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
-{
- return device_to_device(
- policy, first, last, result, typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
-}
} // namespace __copy

} // namespace cuda_cub
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
index 2a3cc4e33..8fb337b26 100644
--- a/thrust/thrust/system/cuda/detail/dispatch.h
+++ b/thrust/thrust/system/cuda/detail/dispatch.h
@@ -44,8 +44,7 @@
} \
else \
{ \
- auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
- status = call arguments; \
+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
}

/**
@@ -66,9 +65,7 @@
} \
else \
{ \
- auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \
- auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \
- status = call arguments; \
+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
}
/**
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index 0606485bb..dbb99ff13 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy
};

/// SM60 (GP100)
- struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
{
enum
{
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index f39613adb..75bd16ff9 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -488,7 +488,7 @@ struct DeviceReducePolicy
};

/// SM60
- struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
{
static constexpr int threads_per_block = 256;
static constexpr int items_per_thread = 16;
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index 419908c4e..6ab0840e1 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -339,7 +339,7 @@ struct DeviceScanPolicy
/// SM600
struct Policy600
: DefaultTuning
- , ChainedPolicy<600, Policy600, Policy520>
+ , ChainedPolicy<600, Policy600, Policy600>
{};

/// SM800
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
index eb76ebb0b..c6c529a50 100644
--- a/cub/cub/block/block_merge_sort.cuh
+++ b/cub/cub/block/block_merge_sort.cuh
@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge(
KeyT key1 = keys_shared[keys1_beg];
KeyT key2 = keys_shared[keys2_beg];

-#pragma unroll
+#pragma unroll 1
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
@@ -376,7 +376,7 @@ public:
//
KeyT max_key = oob_default;

-#pragma unroll
+#pragma unroll 1
for (int item = 1; item < ITEMS_PER_THREAD; ++item)
{
if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh
index 7d9e8622f..da5627306 100644
--- a/cub/cub/thread/thread_sort.cuh
+++ b/cub/cub/thread/thread_sort.cuh
@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE
{
constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;

-#pragma unroll
+#pragma unroll 1
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
{
-#pragma unroll
+#pragma unroll 1
for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
{
if (compare_op(keys[j + 1], keys[j]))
9 changes: 5 additions & 4 deletions cpp/src/io/comp/statistics.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <rmm/exec_policy.hpp>

#include <cuda/functional>
#include <thrust/transform_reduce.h>

namespace cudf::io {
Expand All @@ -32,9 +33,9 @@ writer_compression_statistics collect_compression_statistics(
rmm::exec_policy(stream),
results.begin(),
results.end(),
[] __device__(auto& res) {
cuda::proclaim_return_type<size_t>([] __device__(compression_result const& res) {
return res.status == compression_status::SUCCESS ? res.bytes_written : 0;
},
}),
0ul,
thrust::plus<size_t>());

Expand All @@ -47,9 +48,9 @@ writer_compression_statistics collect_compression_statistics(
rmm::exec_policy(stream),
zipped_begin,
zipped_end,
[status] __device__(auto tup) {
cuda::proclaim_return_type<size_t>([status] __device__(auto tup) {
return thrust::get<1>(tup).status == status ? thrust::get<0>(tup).size() : 0;
},
}),
0ul,
thrust::plus<size_t>());
};
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/io/orc/reader_impl_decode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -692,8 +692,7 @@ std::vector<range> find_table_splits(table_view const& input,
d_sizes = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
// Since the number of rows may not divisible by segment_length,
// the last segment may be shorter than the others.
auto const current_length =
cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
auto const current_length = std::min(segment_length, num_rows - segment_length * segment_idx);
trxcllnt marked this conversation as resolved.
Show resolved Hide resolved
auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes
return cumulative_size{static_cast<std::size_t>(current_length),
static_cast<std::size_t>(size)};
Expand Down
13 changes: 8 additions & 5 deletions cpp/src/io/parquet/page_string_decode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1197,14 +1197,17 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span<PageInfo> pages,
cudf::detail::join_streams(streams, stream);

// check for needed temp space for DELTA_BYTE_ARRAY
auto const need_sizes = thrust::any_of(
rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), [] __device__(auto& page) {
return page.temp_string_size != 0;
});
auto const need_sizes =
thrust::any_of(rmm::exec_policy(stream),
pages.device_begin(),
pages.device_end(),
cuda::proclaim_return_type<bool>(
[] __device__(auto const& page) { return page.temp_string_size != 0; }));
trxcllnt marked this conversation as resolved.
Show resolved Hide resolved

if (need_sizes) {
// sum up all of the temp_string_sizes
auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; };
auto const page_sizes = cuda::proclaim_return_type<int64_t>(
[] __device__(PageInfo const& page) { return page.temp_string_size; });
auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream),
pages.device_begin(),
pages.device_end(),
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -452,9 +452,9 @@ std::string encoding_to_string(Encoding encoding)
[[nodiscard]] std::string list_unsupported_encodings(device_span<PageInfo const> pages,
rmm::cuda_stream_view stream)
{
auto const to_mask = [] __device__(auto const& page) {
auto const to_mask = cuda::proclaim_return_type<uint32_t>([] __device__(auto const& page) {
return is_supported_encoding(page.encoding) ? 0U : encoding_to_mask(page.encoding);
};
});
uint32_t const unsupported = thrust::transform_reduce(
rmm::exec_policy(stream), pages.begin(), pages.end(), to_mask, 0U, thrust::bit_or<uint32_t>());
return encoding_bitmask_to_str(unsupported);
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/io/utilities/data_casting.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <rmm/resource_ref.hpp>

#include <cub/cub.cuh>
#include <cuda/functional>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
Expand Down Expand Up @@ -783,7 +784,8 @@ template <typename SymbolT>
struct to_string_view_pair {
SymbolT const* data;
to_string_view_pair(SymbolT const* _data) : data(_data) {}
__device__ auto operator()(thrust::tuple<size_type, size_type> ip)
__device__ thrust::pair<char const*, std::size_t> operator()(
thrust::tuple<size_type, size_type> ip)
{
return thrust::pair<char const*, std::size_t>{data + thrust::get<0>(ip),
static_cast<std::size_t>(thrust::get<1>(ip))};
Expand All @@ -805,7 +807,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
rmm::exec_policy(stream),
str_tuples,
str_tuples + col_size,
[] __device__(auto t) { return t.second; },
cuda::proclaim_return_type<std::size_t>([] __device__(auto t) { return t.second; }),
size_type{0},
thrust::maximum<size_type>{});

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/distinct_hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});

auto const [probe_indices_end, _] = this->_hash_table.retrieve(
iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value());
iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()});

auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
build_indices->resize(actual_size, stream);
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/split/split_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,9 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
[d_offsets] __device__(auto const idx) -> size_type {
cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto const idx) -> size_type {
return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
},
}),
0,
thrust::maximum<size_type>{});

Expand Down
1 change: 0 additions & 1 deletion cpp/tests/hash_map/map_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ struct InsertTest : public cudf::test::BaseFixture {

using TestTypes = ::testing::Types<key_value_types<int32_t, int32_t>,
key_value_types<int64_t, int64_t>,
key_value_types<int16_t, int16_t>,
key_value_types<int32_t, float>,
key_value_types<int64_t, double>>;

Expand Down
3 changes: 3 additions & 0 deletions rapids_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ else()
)
endif()

set(rapids-cmake-repo trxcllnt/rapids-cmake)
set(rapids-cmake-branch fea/cccl-2.5)

trxcllnt marked this conversation as resolved.
Show resolved Hide resolved
if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
file(
DOWNLOAD
Expand Down
Loading