From b74a236e5ec455b8539317304d1908f3ba9220f1 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 17 Apr 2024 17:08:55 +0200 Subject: [PATCH 01/26] Always use `cuda::proclaim_return_type` with device lambdas Co-authored-by: Bernhard Manfred Gruber --- cpp/src/io/comp/statistics.cu | 9 +++++---- cpp/src/io/parquet/page_string_decode.cu | 6 +++--- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 ++-- cpp/src/io/utilities/data_casting.cu | 5 +++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/statistics.cu index 2a9eb782800..e268f7a5eb3 100644 --- a/cpp/src/io/comp/statistics.cu +++ b/cpp/src/io/comp/statistics.cu @@ -18,6 +18,7 @@ #include +#include #include namespace cudf::io { @@ -32,9 +33,9 @@ writer_compression_statistics collect_compression_statistics( rmm::exec_policy(stream), results.begin(), results.end(), - [] __device__(auto& res) { + cuda::proclaim_return_type([] __device__(const compression_result& res) { return res.status == compression_status::SUCCESS ? res.bytes_written : 0; - }, + }), 0ul, thrust::plus()); @@ -47,9 +48,9 @@ writer_compression_statistics collect_compression_statistics( rmm::exec_policy(stream), zipped_begin, zipped_end, - [status] __device__(auto tup) { + cuda::proclaim_return_type([status] __device__(auto tup) { return thrust::get<1>(tup).status == status ? thrust::get<0>(tup).size() : 0; - }, + }), 0ul, thrust::plus()); }; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 8bb56c66d0f..1fb4741e9ab 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1186,13 +1186,13 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span pages, // check for needed temp space for DELTA_BYTE_ARRAY auto const need_sizes = thrust::any_of( - rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), [] __device__(auto& page) { + rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), cuda::proclaim_return_type([] __device__(auto& page) { return page.temp_string_size != 0; - }); + })); if (need_sizes) { // sum up all of the temp_string_sizes - auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; }; + auto const page_sizes = cuda::proclaim_return_type([] __device__(PageInfo const& page) { return page.temp_string_size; }); auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 1b0a10be811..6dee87b7f3e 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -452,9 +452,9 @@ std::string encoding_to_string(Encoding encoding) [[nodiscard]] std::string list_unsupported_encodings(device_span pages, rmm::cuda_stream_view stream) { - auto const to_mask = [] __device__(auto const& page) { + auto const to_mask = cuda::proclaim_return_type([] __device__(auto const& page) { return is_supported_encoding(page.encoding) ? 0U : encoding_to_mask(page.encoding); - }; + }); uint32_t const unsupported = thrust::transform_reduce( rmm::exec_policy(stream), pages.begin(), pages.end(), to_mask, 0U, thrust::bit_or()); return encoding_bitmask_to_str(unsupported); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 4b5d47e71fb..bb928085d42 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -782,7 +783,7 @@ template struct to_string_view_pair { SymbolT const* data; to_string_view_pair(SymbolT const* _data) : data(_data) {} - __device__ auto operator()(thrust::tuple ip) + __device__ thrust::pair operator()(thrust::tuple ip) { return thrust::pair{data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; @@ -804,7 +805,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, rmm::exec_policy(stream), str_tuples, str_tuples + col_size, - [] __device__(auto t) { return t.second; }, + cuda::proclaim_return_type([] __device__(auto t) { return t.second; }), size_type{0}, thrust::maximum{}); From 942291c25323c4beb511d1632ea0592db9ce21a5 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 17 Apr 2024 17:09:36 +0200 Subject: [PATCH 02/26] Work around issue with `thrust::pair` that prevents CTAD --- cpp/src/groupby/sort/group_rank_scan.cu | 6 +++--- cpp/src/io/comp/nvcomp_adapter.cu | 2 +- cpp/src/lists/contains.cu | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 5cf7844410e..a040480939d 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -133,10 +133,10 @@ std::unique_ptr rank_generator(column_view const& grouped_values, auto [group_labels_begin, mutable_rank_begin] = [&]() { if constexpr (forward) { - return thrust::pair{group_labels.begin(), mutable_ranks.begin()}; + return thrust::make_pair(group_labels.begin(), mutable_ranks.begin()); } else { - return thrust::pair{thrust::reverse_iterator(group_labels.end()), - thrust::reverse_iterator(mutable_ranks.end())}; + return thrust::make_pair(thrust::reverse_iterator(group_labels.end()), + thrust::reverse_iterator(mutable_ranks.end())); } }(); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 794d452ebf2..1a5c8b9493e 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -108,7 +108,7 @@ void skip_unsupported_inputs(device_span input_sizes, input_sizes.begin(), status_size_it, [] __device__(auto const& status) { - return thrust::pair{0, compression_result{0, compression_status::SKIPPED}}; + return thrust::make_pair(0, compression_result{0, compression_status::SKIPPED}); }, [max_size = max_valid_input_size.value()] __device__(size_t input_size) { return input_size > max_size; diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 378cf678f1f..8a512f759c9 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -103,9 +103,9 @@ __device__ auto element_index_pair_iter(size_type const size) auto const end = thrust::make_counting_iterator(size); if constexpr (forward) { - return thrust::pair{begin, end}; + return thrust::make_pair(begin, end); } else { - return thrust::pair{thrust::make_reverse_iterator(end), thrust::make_reverse_iterator(begin)}; + return thrust::make_pair(thrust::make_reverse_iterator(end), thrust::make_reverse_iterator(begin)); } } From e172942265b8c07d432c01c4f9088eed1b452178 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 17 Apr 2024 11:25:37 -0500 Subject: [PATCH 03/26] Apply suggestions from code review --- cpp/src/io/comp/statistics.cu | 2 +- cpp/src/io/parquet/page_string_decode.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/statistics.cu index e268f7a5eb3..faf967041bc 100644 --- a/cpp/src/io/comp/statistics.cu +++ b/cpp/src/io/comp/statistics.cu @@ -33,7 +33,7 @@ writer_compression_statistics collect_compression_statistics( rmm::exec_policy(stream), results.begin(), results.end(), - cuda::proclaim_return_type([] __device__(const compression_result& res) { + cuda::proclaim_return_type([] __device__(compression_result const& res) { return res.status == compression_status::SUCCESS ? res.bytes_written : 0; }), 0ul, diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 1fb4741e9ab..3fe227a89f2 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1186,7 +1186,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span pages, // check for needed temp space for DELTA_BYTE_ARRAY auto const need_sizes = thrust::any_of( - rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), cuda::proclaim_return_type([] __device__(auto& page) { + rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), cuda::proclaim_return_type([] __device__(auto const& page) { return page.temp_string_size != 0; })); From 6ca0c2ad26556cf9ac5a875a9035421b617bebfe Mon Sep 17 00:00:00 2001 From: ptaylor Date: Mon, 22 Apr 2024 20:30:35 +0000 Subject: [PATCH 04/26] fix lint --- cpp/src/io/comp/nvcomp_adapter.cu | 2 +- cpp/src/io/parquet/page_string_decode.cu | 13 ++++++++----- cpp/src/io/utilities/data_casting.cu | 3 ++- cpp/src/lists/contains.cu | 3 ++- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 1a5c8b9493e..53db714fa68 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index f9556fbca84..806a2c4f01f 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1182,14 +1182,17 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span pages, cudf::detail::join_streams(streams, stream); // check for needed temp space for DELTA_BYTE_ARRAY - auto const need_sizes = thrust::any_of( - rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), cuda::proclaim_return_type([] __device__(auto const& page) { - return page.temp_string_size != 0; - })); + auto const need_sizes = + thrust::any_of(rmm::exec_policy(stream), + pages.device_begin(), + pages.device_end(), + cuda::proclaim_return_type( + [] __device__(auto const& page) { return page.temp_string_size != 0; })); if (need_sizes) { // sum up all of the temp_string_sizes - auto const page_sizes = cuda::proclaim_return_type([] __device__(PageInfo const& page) { return page.temp_string_size; }); + auto const page_sizes = cuda::proclaim_return_type( + [] __device__(PageInfo const& page) { return page.temp_string_size; }); auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 91d7e9859ba..60cbfbc0dae 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -784,7 +784,8 @@ template struct to_string_view_pair { SymbolT const* data; to_string_view_pair(SymbolT const* _data) : data(_data) {} - __device__ thrust::pair operator()(thrust::tuple ip) + __device__ thrust::pair operator()( + thrust::tuple ip) { return thrust::pair{data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 6ee97aa18e6..4e8938b52de 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -106,7 +106,8 @@ __device__ auto element_index_pair_iter(size_type const size) if constexpr (forward) { return thrust::make_pair(begin, end); } else { - return thrust::make_pair(thrust::make_reverse_iterator(end), thrust::make_reverse_iterator(begin)); + return thrust::make_pair(thrust::make_reverse_iterator(end), + thrust::make_reverse_iterator(begin)); } } From 04f1990e57a0a2af92328f4dacf6cc60d8a0047d Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 8 May 2024 11:27:57 -0700 Subject: [PATCH 05/26] use std::min instead of cuda::std::min --- cpp/src/io/orc/reader_impl_decode.cu | 3 +-- cpp/src/transform/row_bit_count.cu | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index ec936b85761..c229c00b305 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -692,8 +692,7 @@ std::vector find_table_splits(table_view const& input, d_sizes = d_segmented_sizes->view().begin()] __device__(auto const segment_idx) { // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. - auto const current_length = - cuda::std::min(segment_length, num_rows - segment_length * segment_idx); + auto const current_length = std::min(segment_length, num_rows - segment_length * segment_idx); auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes return cumulative_size{static_cast(current_length), static_cast(size)}; diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index bfac7ab586e..04ac483fd55 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -425,7 +425,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span col // current row span - always starts at spanning over `segment_length` rows. auto const num_rows = cols[0].size(); auto const get_default_row_span = [=] { - return row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)}; + return row_span{tid * segment_length, std::min((tid + 1) * segment_length, num_rows)}; }; auto cur_span = get_default_row_span(); @@ -514,7 +514,7 @@ std::unique_ptr segmented_row_bit_count(table_view const& t, // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. auto const current_length = - cuda::std::min(segment_length, num_rows - segment_length * segment_idx); + std::min(segment_length, num_rows - segment_length * segment_idx); return per_row_size * current_length; })); return output; From 586f5029d9a8039f20ad654ec04cbc209d8061f7 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 8 May 2024 11:33:26 -0700 Subject: [PATCH 06/26] use cuda::proclaim_return_type --- cpp/src/strings/split/split_re.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 4dfb3e9ea62..e754e7ed05d 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -219,9 +219,9 @@ std::unique_ptr split_re(strings_column_view const& input, rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - [d_offsets] __device__(auto const idx) -> size_type { + cuda::proclaim_return_type([d_offsets] __device__(auto const idx) -> size_type { return static_cast(d_offsets[idx + 1] - d_offsets[idx]); - }, + }), 0, thrust::maximum{}); From 3598c8ff3bce66b54d41eb2e1cf63d9186fb6adf Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 8 May 2024 11:45:45 -0700 Subject: [PATCH 07/26] remove test for int16_t key type that's unsupported by cuda::atomic_ref --- cpp/tests/hash_map/map_test.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index 4b10716706b..be2e33538b9 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -69,7 +69,6 @@ struct InsertTest : public cudf::test::BaseFixture { using TestTypes = ::testing::Types, key_value_types, - key_value_types, key_value_types, key_value_types>; From 69796d5ad54351dbb111875f43d38fa598bebe43 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 8 May 2024 13:03:43 -0700 Subject: [PATCH 08/26] regenerate and add patches that apply to CCCL main --- .../thirdparty/patches/cccl_override.json | 20 ++++++++ .../patches/revert_pr_211_cccl_2.5.0.diff | 47 +++++++++++++++++++ ..._disable_64bit_dispatching_cccl_2.5.0.diff | 25 ++++++++++ ..._faster_scan_compile_times_cccl_2.5.0.diff | 39 +++++++++++++++ ..._faster_sort_compile_times_cccl_2.5.0.diff | 39 +++++++++++++++ 5 files changed, 170 insertions(+) create mode 100644 cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff create mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff create mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff create mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index b33f17f3e4a..059f713e7a5 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -18,6 +18,11 @@ "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", "fixed_in" : "" }, + { + "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff", + "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", + "fixed_in" : "" + }, { "file": "cccl/kernel_pointer_hiding.diff", "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]", @@ -28,15 +33,30 @@ "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", "fixed_in" : "" }, + { + "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff", + "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", + "fixed_in" : "" + }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", "fixed_in" : "" }, + { + "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff", + "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", + "fixed_in" : "" + }, { "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", "fixed_in" : "" + }, + { + "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff", + "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", + "fixed_in" : "" } ] } diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff new file mode 100644 index 00000000000..27ff16744f5 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff @@ -0,0 +1,47 @@ +diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h +index 046eb83c0..8047c9701 100644 +--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h ++++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h +@@ -53,41 +53,15 @@ namespace cuda_cub + + namespace __copy + { +-template +-OutputIt THRUST_RUNTIME_FUNCTION device_to_device( +- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type) +-{ +- typedef typename thrust::iterator_traits::value_type InputTy; +- const auto n = thrust::distance(first, last); +- if (n > 0) +- { +- cudaError status; +- status = trivial_copy_device_to_device( +- policy, +- reinterpret_cast(thrust::raw_pointer_cast(&*result)), +- reinterpret_cast(thrust::raw_pointer_cast(&*first)), +- n); +- cuda_cub::throw_on_error(status, "__copy:: D->D: failed"); +- } +- +- return result + n; +-} + + template + OutputIt THRUST_RUNTIME_FUNCTION device_to_device( +- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type) ++ execution_policy& policy, InputIt first, InputIt last, OutputIt result) + { + typedef typename thrust::iterator_traits::value_type InputTy; + return cuda_cub::transform(policy, first, last, result, thrust::identity()); + } + +-template +-OutputIt THRUST_RUNTIME_FUNCTION +-device_to_device(execution_policy& policy, InputIt first, InputIt last, OutputIt result) +-{ +- return device_to_device( +- policy, first, last, result, typename is_indirectly_trivially_relocatable_to::type()); +-} + } // namespace __copy + + } // namespace cuda_cub diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff new file mode 100644 index 00000000000..6ae1e1c917b --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff @@ -0,0 +1,25 @@ +diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h +index 2a3cc4e33..8fb337b26 100644 +--- a/thrust/thrust/system/cuda/detail/dispatch.h ++++ b/thrust/thrust/system/cuda/detail/dispatch.h +@@ -44,8 +44,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + + /** +@@ -66,9 +65,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff new file mode 100644 index 00000000000..fee46046194 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff @@ -0,0 +1,39 @@ +diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh +index 0606485bb..dbb99ff13 100644 +--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh ++++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh +@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy + }; + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum + { +diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh +index f39613adb..75bd16ff9 100644 +--- a/cub/cub/device/dispatch/dispatch_reduce.cuh ++++ b/cub/cub/device/dispatch/dispatch_reduce.cuh +@@ -488,7 +488,7 @@ struct DeviceReducePolicy + }; + + /// SM60 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + static constexpr int threads_per_block = 256; + static constexpr int items_per_thread = 16; +diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh +index 419908c4e..6ab0840e1 100644 +--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh ++++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh +@@ -339,7 +339,7 @@ struct DeviceScanPolicy + /// SM600 + struct Policy600 + : DefaultTuning +- , ChainedPolicy<600, Policy600, Policy520> ++ , ChainedPolicy<600, Policy600, Policy600> + {}; + + /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff new file mode 100644 index 00000000000..cb0cc55f4d2 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff @@ -0,0 +1,39 @@ +diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh +index eb76ebb0b..c6c529a50 100644 +--- a/cub/cub/block/block_merge_sort.cuh ++++ b/cub/cub/block/block_merge_sort.cuh +@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( + KeyT key1 = keys_shared[keys1_beg]; + KeyT key2 = keys_shared[keys2_beg]; + +-#pragma unroll ++#pragma unroll 1 + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -376,7 +376,7 @@ public: + // + KeyT max_key = oob_default; + +-#pragma unroll ++#pragma unroll 1 + for (int item = 1; item < ITEMS_PER_THREAD; ++item) + { + if (ITEMS_PER_THREAD * linear_tid + item < valid_items) +diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh +index 7d9e8622f..da5627306 100644 +--- a/cub/cub/thread/thread_sort.cuh ++++ b/cub/cub/thread/thread_sort.cuh +@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE + { + constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; + +-#pragma unroll ++#pragma unroll 1 + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { +-#pragma unroll ++#pragma unroll 1 + for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) + { + if (compare_op(keys[j + 1], keys[j])) From 8c43425352485c2fb09da0cc31f718818cf327a7 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 8 May 2024 13:12:18 -0700 Subject: [PATCH 09/26] don't modify whitespace in patches --- .pre-commit-config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0ae745257cb..f76eb6f2652 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,11 +7,13 @@ repos: - id: trailing-whitespace exclude: | (?x)^( + ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - id: end-of-file-fixer exclude: | (?x)^( + ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - repo: https://github.com/PyCQA/isort From 991c78953fd3aebc11c60891e7e391f366808f15 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 8 May 2024 13:18:27 -0700 Subject: [PATCH 10/26] don't run clang-format on files in cpp/build/* --- .pre-commit-config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f76eb6f2652..cd082b8282a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,6 +53,10 @@ repos: - id: clang-format types_or: [c, c++, cuda] args: ["-fallback-style=none", "-style=file", "-i"] + exclude: | + (?x)^( + ^cpp/build/.* + ) - repo: https://github.com/sirosen/texthooks rev: 0.6.6 hooks: From 9320968bdb2acd37dfc74b77a27f79008d093935 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Thu, 9 May 2024 20:28:50 +0000 Subject: [PATCH 11/26] update devcontainer workflow to use NVIDIA/cccl#pull-request/1667 --- .github/workflows/pr.yaml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f9d5976f1fe..f684a6156dc 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -157,8 +157,24 @@ jobs: arch: '["amd64"]' cuda: '["12.2"]' build_command: | + # Tell rapids-cmake to use custom CCCL and cuCollections forks + rapids_branch="$(yq '.x-git-defaults.tag' /opt/rapids-build-utils/manifest.yaml)"; + rapids_version="${rapids_branch#branch-}"; + curl -fsSL -o- https://raw.githubusercontent.com/trxcllnt/rapids-cmake/branch-24.04-cccl-2.4.0/rapids-cmake/cpm/patches/cccl/revert_pr_211.diff \ + | tee ~/rapids-cmake-revert_pr_211.diff; + curl -fsSL -o- "https://raw.githubusercontent.com/rapidsai/rapids-cmake/${rapids_branch}/rapids-cmake/cpm/versions.json" \ + | jq -r ".packages.CCCL *= {\"version\": \"2.5.0\", \"git_tag\": \"pull-request/1667\"}" \ + | jq -r "(.packages.CCCL.patches[] | select(.file == \"cccl/revert_pr_211.diff\")).file = \"${HOME}/rapids-cmake-revert_pr_211.diff\"" \ + | jq -r ".packages.cuco *= {\"git_url\": \"https://github.com/trxcllnt/cuCollections.git\", \"git_tag\": \"rapids-${rapids_version}-cccl-2.5.0\", \"always_download\": true}" \ + | tee ~/rapids-cmake-override-versions.json; sccache -z; - build-all -DBUILD_BENCHMARKS=ON --verbose; + build-all \ + -j$(nproc --ignore=1) -v \ + -DBUILD_TESTS=ON \ + -DBUILD_BENCHMARKS=ON \ + -DCMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0" \ + -DCMAKE_CUDA_FLAGS="-ftemplate-backtrace-limit=0" \ + -DRAPIDS_CMAKE_CPM_DEFAULT_VERSION_FILE="${HOME}/rapids-cmake-override-versions.json"; sccache -s; unit-tests-cudf-pandas: needs: wheel-build-cudf From 4ff0c5968fae67e89729bb5515067ffcd6d6740b Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 14 May 2024 21:33:56 +0000 Subject: [PATCH 12/26] test rapids-cmake with CCCL 2.5 --- rapids_config.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rapids_config.cmake b/rapids_config.cmake index 3a88769f6e7..e7137bb71dd 100644 --- a/rapids_config.cmake +++ b/rapids_config.cmake @@ -26,6 +26,9 @@ else() ) endif() +set(rapids-cmake-repo trxcllnt/rapids-cmake) +set(rapids-cmake-branch fea/cccl-2.5) + if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake") file( DOWNLOAD From f122905d83646b0eea4ad1fecdc775767567bc41 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 14 May 2024 21:34:17 +0000 Subject: [PATCH 13/26] pass cuco::cuda_stream_ref --- cpp/src/join/distinct_hash_join.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index ad401bdccba..0ac2ad8cf5b 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -181,8 +181,12 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, auto const probe_indices_begin = thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{}); - auto const [probe_indices_end, _] = this->_hash_table.retrieve( - iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value()); + auto const [probe_indices_end, _] = + this->_hash_table.retrieve(iter, + iter + probe_table_num_rows, + probe_indices_begin, + build_indices_begin, + cuco::cuda_stream_ref{stream.value()}); auto const actual_size = std::distance(probe_indices_begin, probe_indices_end); build_indices->resize(actual_size, stream); From 434600e22f8a5db6203b99938943b295246ef9f0 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 14 May 2024 21:34:30 +0000 Subject: [PATCH 14/26] revert changes to pr.yaml --- .github/workflows/pr.yaml | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index f684a6156dc..f9d5976f1fe 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -157,24 +157,8 @@ jobs: arch: '["amd64"]' cuda: '["12.2"]' build_command: | - # Tell rapids-cmake to use custom CCCL and cuCollections forks - rapids_branch="$(yq '.x-git-defaults.tag' /opt/rapids-build-utils/manifest.yaml)"; - rapids_version="${rapids_branch#branch-}"; - curl -fsSL -o- https://raw.githubusercontent.com/trxcllnt/rapids-cmake/branch-24.04-cccl-2.4.0/rapids-cmake/cpm/patches/cccl/revert_pr_211.diff \ - | tee ~/rapids-cmake-revert_pr_211.diff; - curl -fsSL -o- "https://raw.githubusercontent.com/rapidsai/rapids-cmake/${rapids_branch}/rapids-cmake/cpm/versions.json" \ - | jq -r ".packages.CCCL *= {\"version\": \"2.5.0\", \"git_tag\": \"pull-request/1667\"}" \ - | jq -r "(.packages.CCCL.patches[] | select(.file == \"cccl/revert_pr_211.diff\")).file = \"${HOME}/rapids-cmake-revert_pr_211.diff\"" \ - | jq -r ".packages.cuco *= {\"git_url\": \"https://github.com/trxcllnt/cuCollections.git\", \"git_tag\": \"rapids-${rapids_version}-cccl-2.5.0\", \"always_download\": true}" \ - | tee ~/rapids-cmake-override-versions.json; sccache -z; - build-all \ - -j$(nproc --ignore=1) -v \ - -DBUILD_TESTS=ON \ - -DBUILD_BENCHMARKS=ON \ - -DCMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0" \ - -DCMAKE_CUDA_FLAGS="-ftemplate-backtrace-limit=0" \ - -DRAPIDS_CMAKE_CPM_DEFAULT_VERSION_FILE="${HOME}/rapids-cmake-override-versions.json"; + build-all -DBUILD_BENCHMARKS=ON --verbose; sccache -s; unit-tests-cudf-pandas: needs: wheel-build-cudf From ffdab59ecdef69a60320ba40a0ef777cfc14811b Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Tue, 21 May 2024 12:51:00 -0700 Subject: [PATCH 15/26] Update cpp/src/join/distinct_hash_join.cu Co-authored-by: Yunsong Wang --- cpp/src/join/distinct_hash_join.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index 0ac2ad8cf5b..c8ecce05e70 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -186,7 +186,7 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, - cuco::cuda_stream_ref{stream.value()}); + {stream.value()}); auto const actual_size = std::distance(probe_indices_begin, probe_indices_end); build_indices->resize(actual_size, stream); From 57d9eeafa12c56ecde236cb05e94ae2af85fa4c9 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 21 May 2024 20:33:04 +0000 Subject: [PATCH 16/26] fix lint --- cpp/src/join/distinct_hash_join.cu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index c8ecce05e70..5048da25e86 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -181,12 +181,8 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, auto const probe_indices_begin = thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{}); - auto const [probe_indices_end, _] = - this->_hash_table.retrieve(iter, - iter + probe_table_num_rows, - probe_indices_begin, - build_indices_begin, - {stream.value()}); + auto const [probe_indices_end, _] = this->_hash_table.retrieve( + iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()}); auto const actual_size = std::distance(probe_indices_begin, probe_indices_end); build_indices->resize(actual_size, stream); From a011739d3f2a3ff62a2d4bbaf81de31c36446826 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 21 May 2024 22:39:35 +0000 Subject: [PATCH 17/26] Apply suggestions from reviewers --- .pre-commit-config.yaml | 4 ---- cpp/src/groupby/sort/group_rank_scan.cu | 6 +++--- cpp/src/transform/row_bit_count.cu | 4 ++-- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3092dda3e72..2d3ffc287e9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,10 +56,6 @@ repos: - id: clang-format types_or: [c, c++, cuda] args: ["-fallback-style=none", "-style=file", "-i"] - exclude: | - (?x)^( - ^cpp/build/.* - ) - repo: https://github.com/sirosen/texthooks rev: 0.6.6 hooks: diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 8c6b987f17a..0b65889f127 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -134,10 +134,10 @@ std::unique_ptr rank_generator(column_view const& grouped_values, auto [group_labels_begin, mutable_rank_begin] = [&]() { if constexpr (forward) { - return thrust::make_pair(group_labels.begin(), mutable_ranks.begin()); + return thrust::pair{group_labels.begin(), mutable_ranks.begin()}; } else { - return thrust::make_pair(thrust::reverse_iterator(group_labels.end()), - thrust::reverse_iterator(mutable_ranks.end())); + return thrust::pair{thrust::reverse_iterator(group_labels.end()), + thrust::reverse_iterator(mutable_ranks.end())}; } }(); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 04ac483fd55..bfac7ab586e 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -425,7 +425,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span col // current row span - always starts at spanning over `segment_length` rows. auto const num_rows = cols[0].size(); auto const get_default_row_span = [=] { - return row_span{tid * segment_length, std::min((tid + 1) * segment_length, num_rows)}; + return row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)}; }; auto cur_span = get_default_row_span(); @@ -514,7 +514,7 @@ std::unique_ptr segmented_row_bit_count(table_view const& t, // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. auto const current_length = - std::min(segment_length, num_rows - segment_length * segment_idx); + cuda::std::min(segment_length, num_rows - segment_length * segment_idx); return per_row_size * current_length; })); return output; From 78c1a892b99cb92696908d92083084007596c296 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Tue, 21 May 2024 22:41:51 +0000 Subject: [PATCH 18/26] revert more thrust::pair changes --- cpp/src/io/comp/nvcomp_adapter.cu | 2 +- cpp/src/lists/contains.cu | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 53db714fa68..adb53bbf629 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -108,7 +108,7 @@ void skip_unsupported_inputs(device_span input_sizes, input_sizes.begin(), status_size_it, [] __device__(auto const& status) { - return thrust::make_pair(0, compression_result{0, compression_status::SKIPPED}); + return thrust::pair{0, compression_result{0, compression_status::SKIPPED}}; }, [max_size = max_valid_input_size.value()] __device__(size_t input_size) { return input_size > max_size; diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 0c80d548197..f03d394d6d7 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -105,10 +105,9 @@ __device__ auto element_index_pair_iter(size_type const size) auto const end = thrust::make_counting_iterator(size); if constexpr (forward) { - return thrust::make_pair(begin, end); + return thrust::pair{begin, end}; } else { - return thrust::make_pair(thrust::make_reverse_iterator(end), - thrust::make_reverse_iterator(begin)); + return thrust::pair{thrust::make_reverse_iterator(end), thrust::make_reverse_iterator(begin)}; } } From 68c59973bb903c473195c79fbc00a1c808f85b53 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 22 May 2024 03:43:58 +0000 Subject: [PATCH 19/26] fix lint --- cpp/src/io/comp/nvcomp_adapter.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index adb53bbf629..794d452ebf2 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 114db087f314a712ae9dfb89afcb58f32fed029c Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Wed, 22 May 2024 12:05:06 -0700 Subject: [PATCH 20/26] Apply suggestions from code review --- cpp/src/io/orc/reader_impl_decode.cu | 2 +- cpp/src/io/parquet/page_string_decode.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index c229c00b305..74b90ecd26e 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -692,7 +692,7 @@ std::vector find_table_splits(table_view const& input, d_sizes = d_segmented_sizes->view().begin()] __device__(auto const segment_idx) { // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. - auto const current_length = std::min(segment_length, num_rows - segment_length * segment_idx); + auto const current_length = cuda::std::min(segment_length, num_rows - segment_length * segment_idx); auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes return cumulative_size{static_cast(current_length), static_cast(size)}; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 381badad485..ba3d35b9586 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1202,7 +1202,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span pages, pages.device_begin(), pages.device_end(), cuda::proclaim_return_type( - [] __device__(auto const& page) { return page.temp_string_size != 0; })); + [] __device__(auto& page) { return page.temp_string_size != 0; })); if (need_sizes) { // sum up all of the temp_string_sizes From 64264a7a3b4b9e35784c712aa86d3fd1f4f3c96a Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 22 May 2024 20:37:08 -0700 Subject: [PATCH 21/26] fix lint --- cpp/src/io/orc/reader_impl_decode.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 74b90ecd26e..ec936b85761 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -692,7 +692,8 @@ std::vector find_table_splits(table_view const& input, d_sizes = d_segmented_sizes->view().begin()] __device__(auto const segment_idx) { // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. - auto const current_length = cuda::std::min(segment_length, num_rows - segment_length * segment_idx); + auto const current_length = + cuda::std::min(segment_length, num_rows - segment_length * segment_idx); auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes return cumulative_size{static_cast(current_length), static_cast(size)}; From 902f06b6fbe1346e8f5191aa1008d1bbf9644caf Mon Sep 17 00:00:00 2001 From: ptaylor Date: Thu, 23 May 2024 08:49:22 +0000 Subject: [PATCH 22/26] include cuda/std/functional --- cpp/src/io/orc/reader_impl_decode.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index ec936b85761..eeee9c7eca3 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -36,6 +36,7 @@ #include #include +#include #include #include #include From 3b293450ef80e538101e34a1f3cac8a7c29cdcb4 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Thu, 23 May 2024 09:19:43 +0000 Subject: [PATCH 23/26] cuda::std::min -> std::min --- cpp/src/io/orc/reader_impl_decode.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index eeee9c7eca3..c229c00b305 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -36,7 +36,6 @@ #include #include -#include #include #include #include @@ -693,8 +692,7 @@ std::vector find_table_splits(table_view const& input, d_sizes = d_segmented_sizes->view().begin()] __device__(auto const segment_idx) { // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. - auto const current_length = - cuda::std::min(segment_length, num_rows - segment_length * segment_idx); + auto const current_length = std::min(segment_length, num_rows - segment_length * segment_idx); auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes return cumulative_size{static_cast(current_length), static_cast(size)}; From 422538e9c5c69bfb964ab6d574a4d25908b254f6 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Fri, 24 May 2024 16:41:12 +0000 Subject: [PATCH 24/26] fix orc tests --- cpp/src/io/orc/reader_impl_decode.cu | 2 +- cpp/src/io/orc/stripe_init.cu | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index c229c00b305..da9fb802a0a 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -692,7 +692,7 @@ std::vector find_table_splits(table_view const& input, d_sizes = d_segmented_sizes->view().begin()] __device__(auto const segment_idx) { // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. - auto const current_length = std::min(segment_length, num_rows - segment_length * segment_idx); + auto const current_length = min(segment_length, num_rows - segment_length * segment_idx); auto const size = d_sizes[segment_idx] / CHAR_BIT; // divide by CHAR_BIT to get size in bytes return cumulative_size{static_cast(current_length), static_cast(size)}; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index dd44b779402..0ec83f57032 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -561,20 +561,24 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, uint32_t log2maxcr, rmm::cuda_stream_view stream) { - dim3 dim_block(128, 1); - dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block - gpuParseCompressedStripeData<<>>( - strm_info, num_streams, compression_block_size, log2maxcr); + if (num_streams > 0) { + dim3 dim_block(128, 1); + dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block + gpuParseCompressedStripeData<<>>( + strm_info, num_streams, compression_block_size, log2maxcr); + } } void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams, rmm::cuda_stream_view stream) { - dim3 dim_block(128, 1); - dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block - gpuPostDecompressionReassemble<<>>(strm_info, - num_streams); + if (num_streams > 0) { + dim3 dim_block(128, 1); + dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block + gpuPostDecompressionReassemble<<>>(strm_info, + num_streams); + } } void __host__ ParseRowGroupIndex(RowGroup* row_groups, From ef42695aa9d97c6ae274ee1de8e33d7770d36d4b Mon Sep 17 00:00:00 2001 From: ptaylor Date: Fri, 24 May 2024 17:09:08 +0000 Subject: [PATCH 25/26] compute and compare num_blocks instead of num_streams --- cpp/src/io/orc/stripe_init.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 0ec83f57032..89dbbcb796c 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -561,9 +561,10 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, uint32_t log2maxcr, rmm::cuda_stream_view stream) { - if (num_streams > 0) { + auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block + if (num_blocks > 0) { dim3 dim_block(128, 1); - dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block + dim3 dim_grid(num_blocks, 1); gpuParseCompressedStripeData<<>>( strm_info, num_streams, compression_block_size, log2maxcr); } @@ -573,9 +574,10 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams, rmm::cuda_stream_view stream) { - if (num_streams > 0) { + auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block + if (num_blocks > 0) { dim3 dim_block(128, 1); - dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block + dim3 dim_grid(num_blocks, 1); gpuPostDecompressionReassemble<<>>(strm_info, num_streams); } From d9a4947eacc8fe9e62d99f8fe8064c16afc1a990 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Fri, 24 May 2024 13:19:14 -0700 Subject: [PATCH 26/26] revert changes to use rapids-cmake CCCL 2.5 branch --- rapids_config.cmake | 3 --- 1 file changed, 3 deletions(-) diff --git a/rapids_config.cmake b/rapids_config.cmake index e7137bb71dd..3a88769f6e7 100644 --- a/rapids_config.cmake +++ b/rapids_config.cmake @@ -26,9 +26,6 @@ else() ) endif() -set(rapids-cmake-repo trxcllnt/rapids-cmake) -set(rapids-cmake-branch fea/cccl-2.5) - if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake") file( DOWNLOAD