From b74a236e5ec455b8539317304d1908f3ba9220f1 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 17 Apr 2024 17:08:55 +0200
Subject: [PATCH 01/26] Always use `cuda::proclaim_return_type` with device
 lambdas

Co-authored-by: Bernhard Manfred Gruber <bgruber@nvidia.com>
---
 cpp/src/io/comp/statistics.cu                | 9 +++++----
 cpp/src/io/parquet/page_string_decode.cu     | 6 +++---
 cpp/src/io/parquet/reader_impl_preprocess.cu | 4 ++--
 cpp/src/io/utilities/data_casting.cu         | 5 +++--
 4 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/statistics.cu
index 2a9eb782800..e268f7a5eb3 100644
--- a/cpp/src/io/comp/statistics.cu
+++ b/cpp/src/io/comp/statistics.cu
@@ -18,6 +18,7 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/functional>
 #include <thrust/transform_reduce.h>
 
 namespace cudf::io {
@@ -32,9 +33,9 @@ writer_compression_statistics collect_compression_statistics(
     rmm::exec_policy(stream),
     results.begin(),
     results.end(),
-    [] __device__(auto& res) {
+    cuda::proclaim_return_type<size_t>([] __device__(const compression_result& res) {
       return res.status == compression_status::SUCCESS ? res.bytes_written : 0;
-    },
+    }),
     0ul,
     thrust::plus<size_t>());
 
@@ -47,9 +48,9 @@ writer_compression_statistics collect_compression_statistics(
       rmm::exec_policy(stream),
       zipped_begin,
       zipped_end,
-      [status] __device__(auto tup) {
+      cuda::proclaim_return_type<size_t>([status] __device__(auto tup) {
         return thrust::get<1>(tup).status == status ? thrust::get<0>(tup).size() : 0;
-      },
+      }),
       0ul,
       thrust::plus<size_t>());
   };
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 8bb56c66d0f..1fb4741e9ab 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -1186,13 +1186,13 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span<PageInfo> pages,
 
   // check for needed temp space for DELTA_BYTE_ARRAY
   auto const need_sizes = thrust::any_of(
-    rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), [] __device__(auto& page) {
+    rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), cuda::proclaim_return_type<bool>([] __device__(auto& page) {
       return page.temp_string_size != 0;
-    });
+    }));
 
   if (need_sizes) {
     // sum up all of the temp_string_sizes
-    auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; };
+    auto const page_sizes = cuda::proclaim_return_type<int64_t>([] __device__(PageInfo const& page) { return page.temp_string_size; });
     auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream),
                                                      pages.device_begin(),
                                                      pages.device_end(),
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 1b0a10be811..6dee87b7f3e 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -452,9 +452,9 @@ std::string encoding_to_string(Encoding encoding)
 [[nodiscard]] std::string list_unsupported_encodings(device_span<PageInfo const> pages,
                                                      rmm::cuda_stream_view stream)
 {
-  auto const to_mask = [] __device__(auto const& page) {
+  auto const to_mask = cuda::proclaim_return_type<uint32_t>([] __device__(auto const& page) {
     return is_supported_encoding(page.encoding) ? 0U : encoding_to_mask(page.encoding);
-  };
+  });
   uint32_t const unsupported = thrust::transform_reduce(
     rmm::exec_policy(stream), pages.begin(), pages.end(), to_mask, 0U, thrust::bit_or<uint32_t>());
   return encoding_bitmask_to_str(unsupported);
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 4b5d47e71fb..bb928085d42 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -33,6 +33,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cub/cub.cuh>
+#include <cuda/functional>
 #include <thrust/copy.h>
 #include <thrust/functional.h>
 #include <thrust/transform_reduce.h>
@@ -782,7 +783,7 @@ template <typename SymbolT>
 struct to_string_view_pair {
   SymbolT const* data;
   to_string_view_pair(SymbolT const* _data) : data(_data) {}
-  __device__ auto operator()(thrust::tuple<size_type, size_type> ip)
+  __device__ thrust::pair<char const*, std::size_t> operator()(thrust::tuple<size_type, size_type> ip)
   {
     return thrust::pair<char const*, std::size_t>{data + thrust::get<0>(ip),
                                                   static_cast<std::size_t>(thrust::get<1>(ip))};
@@ -804,7 +805,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
     rmm::exec_policy(stream),
     str_tuples,
     str_tuples + col_size,
-    [] __device__(auto t) { return t.second; },
+    cuda::proclaim_return_type<std::size_t>([] __device__(auto t) { return t.second; }),
     size_type{0},
     thrust::maximum<size_type>{});
 

From 942291c25323c4beb511d1632ea0592db9ce21a5 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Wed, 17 Apr 2024 17:09:36 +0200
Subject: [PATCH 02/26] Work around issue with `thrust::pair` that prevents
 CTAD

---
 cpp/src/groupby/sort/group_rank_scan.cu | 6 +++---
 cpp/src/io/comp/nvcomp_adapter.cu       | 2 +-
 cpp/src/lists/contains.cu               | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu
index 5cf7844410e..a040480939d 100644
--- a/cpp/src/groupby/sort/group_rank_scan.cu
+++ b/cpp/src/groupby/sort/group_rank_scan.cu
@@ -133,10 +133,10 @@ std::unique_ptr<column> rank_generator(column_view const& grouped_values,
 
   auto [group_labels_begin, mutable_rank_begin] = [&]() {
     if constexpr (forward) {
-      return thrust::pair{group_labels.begin(), mutable_ranks.begin<size_type>()};
+      return thrust::make_pair(group_labels.begin(), mutable_ranks.begin<size_type>());
     } else {
-      return thrust::pair{thrust::reverse_iterator(group_labels.end()),
-                          thrust::reverse_iterator(mutable_ranks.end<size_type>())};
+      return thrust::make_pair(thrust::reverse_iterator(group_labels.end()),
+                               thrust::reverse_iterator(mutable_ranks.end<size_type>()));
     }
   }();
   thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu
index 794d452ebf2..1a5c8b9493e 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cu
+++ b/cpp/src/io/comp/nvcomp_adapter.cu
@@ -108,7 +108,7 @@ void skip_unsupported_inputs(device_span<size_t> input_sizes,
       input_sizes.begin(),
       status_size_it,
       [] __device__(auto const& status) {
-        return thrust::pair{0, compression_result{0, compression_status::SKIPPED}};
+        return thrust::make_pair(0, compression_result{0, compression_status::SKIPPED});
       },
       [max_size = max_valid_input_size.value()] __device__(size_t input_size) {
         return input_size > max_size;
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 378cf678f1f..8a512f759c9 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -103,9 +103,9 @@ __device__ auto element_index_pair_iter(size_type const size)
   auto const end   = thrust::make_counting_iterator(size);
 
   if constexpr (forward) {
-    return thrust::pair{begin, end};
+    return thrust::make_pair(begin, end);
   } else {
-    return thrust::pair{thrust::make_reverse_iterator(end), thrust::make_reverse_iterator(begin)};
+    return thrust::make_pair(thrust::make_reverse_iterator(end), thrust::make_reverse_iterator(begin));
   }
 }
 

From e172942265b8c07d432c01c4f9088eed1b452178 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 17 Apr 2024 11:25:37 -0500
Subject: [PATCH 03/26] Apply suggestions from code review

---
 cpp/src/io/comp/statistics.cu            | 2 +-
 cpp/src/io/parquet/page_string_decode.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/statistics.cu
index e268f7a5eb3..faf967041bc 100644
--- a/cpp/src/io/comp/statistics.cu
+++ b/cpp/src/io/comp/statistics.cu
@@ -33,7 +33,7 @@ writer_compression_statistics collect_compression_statistics(
     rmm::exec_policy(stream),
     results.begin(),
     results.end(),
-    cuda::proclaim_return_type<size_t>([] __device__(const compression_result& res) {
+    cuda::proclaim_return_type<size_t>([] __device__(compression_result const& res) {
       return res.status == compression_status::SUCCESS ? res.bytes_written : 0;
     }),
     0ul,
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 1fb4741e9ab..3fe227a89f2 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -1186,7 +1186,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span<PageInfo> pages,
 
   // check for needed temp space for DELTA_BYTE_ARRAY
   auto const need_sizes = thrust::any_of(
-    rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), cuda::proclaim_return_type<bool>([] __device__(auto& page) {
+    rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), cuda::proclaim_return_type<bool>([] __device__(auto const& page) {
       return page.temp_string_size != 0;
     }));
 

From 6ca0c2ad26556cf9ac5a875a9035421b617bebfe Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Mon, 22 Apr 2024 20:30:35 +0000
Subject: [PATCH 04/26] fix lint

---
 cpp/src/io/comp/nvcomp_adapter.cu        |  2 +-
 cpp/src/io/parquet/page_string_decode.cu | 13 ++++++++-----
 cpp/src/io/utilities/data_casting.cu     |  3 ++-
 cpp/src/lists/contains.cu                |  3 ++-
 4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu
index 1a5c8b9493e..53db714fa68 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cu
+++ b/cpp/src/io/comp/nvcomp_adapter.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index f9556fbca84..806a2c4f01f 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -1182,14 +1182,17 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span<PageInfo> pages,
   cudf::detail::join_streams(streams, stream);
 
   // check for needed temp space for DELTA_BYTE_ARRAY
-  auto const need_sizes = thrust::any_of(
-    rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), cuda::proclaim_return_type<bool>([] __device__(auto const& page) {
-      return page.temp_string_size != 0;
-    }));
+  auto const need_sizes =
+    thrust::any_of(rmm::exec_policy(stream),
+                   pages.device_begin(),
+                   pages.device_end(),
+                   cuda::proclaim_return_type<bool>(
+                     [] __device__(auto const& page) { return page.temp_string_size != 0; }));
 
   if (need_sizes) {
     // sum up all of the temp_string_sizes
-    auto const page_sizes = cuda::proclaim_return_type<int64_t>([] __device__(PageInfo const& page) { return page.temp_string_size; });
+    auto const page_sizes = cuda::proclaim_return_type<int64_t>(
+      [] __device__(PageInfo const& page) { return page.temp_string_size; });
     auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream),
                                                      pages.device_begin(),
                                                      pages.device_end(),
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 91d7e9859ba..60cbfbc0dae 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -784,7 +784,8 @@ template <typename SymbolT>
 struct to_string_view_pair {
   SymbolT const* data;
   to_string_view_pair(SymbolT const* _data) : data(_data) {}
-  __device__ thrust::pair<char const*, std::size_t> operator()(thrust::tuple<size_type, size_type> ip)
+  __device__ thrust::pair<char const*, std::size_t> operator()(
+    thrust::tuple<size_type, size_type> ip)
   {
     return thrust::pair<char const*, std::size_t>{data + thrust::get<0>(ip),
                                                   static_cast<std::size_t>(thrust::get<1>(ip))};
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 6ee97aa18e6..4e8938b52de 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -106,7 +106,8 @@ __device__ auto element_index_pair_iter(size_type const size)
   if constexpr (forward) {
     return thrust::make_pair(begin, end);
   } else {
-    return thrust::make_pair(thrust::make_reverse_iterator(end), thrust::make_reverse_iterator(begin));
+    return thrust::make_pair(thrust::make_reverse_iterator(end),
+                             thrust::make_reverse_iterator(begin));
   }
 }
 

From 04f1990e57a0a2af92328f4dacf6cc60d8a0047d Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 8 May 2024 11:27:57 -0700
Subject: [PATCH 05/26] use std::min instead of cuda::std::min

---
 cpp/src/io/orc/reader_impl_decode.cu | 3 +--
 cpp/src/transform/row_bit_count.cu   | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index ec936b85761..c229c00b305 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -692,8 +692,7 @@ std::vector<range> find_table_splits(table_view const& input,
      d_sizes  = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
       // Since the number of rows may not divisible by segment_length,
       // the last segment may be shorter than the others.
-      auto const current_length =
-        cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+      auto const current_length = std::min(segment_length, num_rows - segment_length * segment_idx);
       auto const size = d_sizes[segment_idx] / CHAR_BIT;  // divide by CHAR_BIT to get size in bytes
       return cumulative_size{static_cast<std::size_t>(current_length),
                              static_cast<std::size_t>(size)};
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index bfac7ab586e..04ac483fd55 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -425,7 +425,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span<column_device_view const> col
   // current row span - always starts at spanning over `segment_length` rows.
   auto const num_rows             = cols[0].size();
   auto const get_default_row_span = [=] {
-    return row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)};
+    return row_span{tid * segment_length, std::min((tid + 1) * segment_length, num_rows)};
   };
   auto cur_span = get_default_row_span();
 
@@ -514,7 +514,7 @@ std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
           // Since the number of rows may not divisible by segment_length,
           // the last segment may be shorter than the others.
           auto const current_length =
-            cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+            std::min(segment_length, num_rows - segment_length * segment_idx);
           return per_row_size * current_length;
         }));
     return output;

From 586f5029d9a8039f20ad654ec04cbc209d8061f7 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 8 May 2024 11:33:26 -0700
Subject: [PATCH 06/26] use cuda::proclaim_return_type

---
 cpp/src/strings/split/split_re.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 4dfb3e9ea62..e754e7ed05d 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -219,9 +219,9 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<size_type>(0),
     thrust::make_counting_iterator<size_type>(strings_count),
-    [d_offsets] __device__(auto const idx) -> size_type {
+    cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto const idx) -> size_type {
       return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
-    },
+    }),
     0,
     thrust::maximum<size_type>{});
 

From 3598c8ff3bce66b54d41eb2e1cf63d9186fb6adf Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 8 May 2024 11:45:45 -0700
Subject: [PATCH 07/26] remove test for int16_t key type that's unsupported by
 cuda::atomic_ref

---
 cpp/tests/hash_map/map_test.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu
index 4b10716706b..be2e33538b9 100644
--- a/cpp/tests/hash_map/map_test.cu
+++ b/cpp/tests/hash_map/map_test.cu
@@ -69,7 +69,6 @@ struct InsertTest : public cudf::test::BaseFixture {
 
 using TestTypes = ::testing::Types<key_value_types<int32_t, int32_t>,
                                    key_value_types<int64_t, int64_t>,
-                                   key_value_types<int16_t, int16_t>,
                                    key_value_types<int32_t, float>,
                                    key_value_types<int64_t, double>>;
 

From 69796d5ad54351dbb111875f43d38fa598bebe43 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 8 May 2024 13:03:43 -0700
Subject: [PATCH 08/26] regenerate and add patches that apply to CCCL main

---
 .../thirdparty/patches/cccl_override.json     | 20 ++++++++
 .../patches/revert_pr_211_cccl_2.5.0.diff     | 47 +++++++++++++++++++
 ..._disable_64bit_dispatching_cccl_2.5.0.diff | 25 ++++++++++
 ..._faster_scan_compile_times_cccl_2.5.0.diff | 39 +++++++++++++++
 ..._faster_sort_compile_times_cccl_2.5.0.diff | 39 +++++++++++++++
 5 files changed, 170 insertions(+)
 create mode 100644 cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
 create mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
 create mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
 create mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff

diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json
index b33f17f3e4a..059f713e7a5 100644
--- a/cpp/cmake/thirdparty/patches/cccl_override.json
+++ b/cpp/cmake/thirdparty/patches/cccl_override.json
@@ -18,6 +18,11 @@
           "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
           "fixed_in" : ""
         },
+        {
+          "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff",
+          "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
+          "fixed_in" : ""
+        },
         {
           "file": "cccl/kernel_pointer_hiding.diff",
           "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]",
@@ -28,15 +33,30 @@
           "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
           "fixed_in" : ""
         },
+        {
+          "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff",
+          "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
+          "fixed_in" : ""
+        },
         {
           "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff",
           "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
           "fixed_in" : ""
         },
+        {
+          "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff",
+          "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
+          "fixed_in" : ""
+        },
         {
           "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff",
           "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
           "fixed_in" : ""
+        },
+        {
+          "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff",
+          "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
+          "fixed_in" : ""
         }
       ]
     }
diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
new file mode 100644
index 00000000000..27ff16744f5
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
@@ -0,0 +1,47 @@
+diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
+index 046eb83c0..8047c9701 100644
+--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
++++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
+@@ -53,41 +53,15 @@ namespace cuda_cub
+ 
+ namespace __copy
+ {
+-template <class Derived, class InputIt, class OutputIt>
+-OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
+-  execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type)
+-{
+-  typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
+-  const auto n = thrust::distance(first, last);
+-  if (n > 0)
+-  {
+-    cudaError status;
+-    status = trivial_copy_device_to_device(
+-      policy,
+-      reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
+-      reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*first)),
+-      n);
+-    cuda_cub::throw_on_error(status, "__copy:: D->D: failed");
+-  }
+-
+-  return result + n;
+-}
+ 
+ template <class Derived, class InputIt, class OutputIt>
+ OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
+-  execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type)
++  execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
+ {
+   typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
+   return cuda_cub::transform(policy, first, last, result, thrust::identity<InputTy>());
+ }
+ 
+-template <class Derived, class InputIt, class OutputIt>
+-OutputIt THRUST_RUNTIME_FUNCTION
+-device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
+-{
+-  return device_to_device(
+-    policy, first, last, result, typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
+-}
+ } // namespace __copy
+ 
+ } // namespace cuda_cub
diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
new file mode 100644
index 00000000000..6ae1e1c917b
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
@@ -0,0 +1,25 @@
+diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
+index 2a3cc4e33..8fb337b26 100644
+--- a/thrust/thrust/system/cuda/detail/dispatch.h
++++ b/thrust/thrust/system/cuda/detail/dispatch.h
+@@ -44,8 +44,7 @@
+   }                                                                                   \
+   else                                                                                \
+   {                                                                                   \
+-    auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
+-    status                             = call arguments;                              \
++    throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
+   }
+ 
+ /**
+@@ -66,9 +65,7 @@
+   }                                                                                          \
+   else                                                                                       \
+   {                                                                                          \
+-    auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1);      \
+-    auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2);      \
+-    status                              = call arguments;                                    \
++    throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
+   }
+ /**
+  * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
new file mode 100644
index 00000000000..fee46046194
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
@@ -0,0 +1,39 @@
+diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+index 0606485bb..dbb99ff13 100644
+--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
++++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy
+   };
+ 
+   /// SM60 (GP100)
+-  struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
++  struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
+   {
+     enum
+     {
+diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
+index f39613adb..75bd16ff9 100644
+--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
++++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
+@@ -488,7 +488,7 @@ struct DeviceReducePolicy
+   };
+ 
+   /// SM60
+-  struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
++  struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
+   {
+     static constexpr int threads_per_block  = 256;
+     static constexpr int items_per_thread   = 16;
+diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+index 419908c4e..6ab0840e1 100644
+--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
++++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+@@ -339,7 +339,7 @@ struct DeviceScanPolicy
+   /// SM600
+   struct Policy600
+       : DefaultTuning
+-      , ChainedPolicy<600, Policy600, Policy520>
++      , ChainedPolicy<600, Policy600, Policy600>
+   {};
+ 
+   /// SM800
diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff
new file mode 100644
index 00000000000..cb0cc55f4d2
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff
@@ -0,0 +1,39 @@
+diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
+index eb76ebb0b..c6c529a50 100644
+--- a/cub/cub/block/block_merge_sort.cuh
++++ b/cub/cub/block/block_merge_sort.cuh
+@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge(
+   KeyT key1 = keys_shared[keys1_beg];
+   KeyT key2 = keys_shared[keys2_beg];
+ 
+-#pragma unroll
++#pragma unroll 1
+   for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+   {
+     bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
+@@ -376,7 +376,7 @@ public:
+       //
+       KeyT max_key = oob_default;
+ 
+-#pragma unroll
++#pragma unroll 1
+       for (int item = 1; item < ITEMS_PER_THREAD; ++item)
+       {
+         if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
+diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh
+index 7d9e8622f..da5627306 100644
+--- a/cub/cub/thread/thread_sort.cuh
++++ b/cub/cub/thread/thread_sort.cuh
+@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE
+ {
+   constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;
+ 
+-#pragma unroll
++#pragma unroll 1
+   for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+   {
+-#pragma unroll
++#pragma unroll 1
+     for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
+     {
+       if (compare_op(keys[j + 1], keys[j]))

From 8c43425352485c2fb09da0cc31f718818cf327a7 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 8 May 2024 13:12:18 -0700
Subject: [PATCH 09/26] don't modify whitespace in patches

---
 .pre-commit-config.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0ae745257cb..f76eb6f2652 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,11 +7,13 @@ repos:
       - id: trailing-whitespace
         exclude: |
           (?x)^(
+            ^cpp/cmake/thirdparty/patches/.*|
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
           )
       - id: end-of-file-fixer
         exclude: |
           (?x)^(
+            ^cpp/cmake/thirdparty/patches/.*|
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
           )
   - repo: https://github.com/PyCQA/isort

From 991c78953fd3aebc11c60891e7e391f366808f15 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 8 May 2024 13:18:27 -0700
Subject: [PATCH 10/26] don't run clang-format on files in cpp/build/*

---
 .pre-commit-config.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f76eb6f2652..cd082b8282a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -53,6 +53,10 @@ repos:
       - id: clang-format
         types_or: [c, c++, cuda]
         args: ["-fallback-style=none", "-style=file", "-i"]
+        exclude: |
+          (?x)^(
+            ^cpp/build/.*
+          )
   - repo: https://github.com/sirosen/texthooks
     rev: 0.6.6
     hooks:

From 9320968bdb2acd37dfc74b77a27f79008d093935 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Thu, 9 May 2024 20:28:50 +0000
Subject: [PATCH 11/26] update devcontainer workflow to use
 NVIDIA/cccl#pull-request/1667

---
 .github/workflows/pr.yaml | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index f9d5976f1fe..f684a6156dc 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -157,8 +157,24 @@ jobs:
       arch: '["amd64"]'
       cuda: '["12.2"]'
       build_command: |
+        # Tell rapids-cmake to use custom CCCL and cuCollections forks
+        rapids_branch="$(yq '.x-git-defaults.tag' /opt/rapids-build-utils/manifest.yaml)";
+        rapids_version="${rapids_branch#branch-}";
+        curl -fsSL -o- https://raw.githubusercontent.com/trxcllnt/rapids-cmake/branch-24.04-cccl-2.4.0/rapids-cmake/cpm/patches/cccl/revert_pr_211.diff \
+          | tee ~/rapids-cmake-revert_pr_211.diff;
+        curl -fsSL -o- "https://raw.githubusercontent.com/rapidsai/rapids-cmake/${rapids_branch}/rapids-cmake/cpm/versions.json" \
+          | jq -r ".packages.CCCL *= {\"version\": \"2.5.0\", \"git_tag\": \"pull-request/1667\"}" \
+          | jq -r "(.packages.CCCL.patches[] | select(.file == \"cccl/revert_pr_211.diff\")).file = \"${HOME}/rapids-cmake-revert_pr_211.diff\"" \
+          | jq -r ".packages.cuco *= {\"git_url\": \"https://github.com/trxcllnt/cuCollections.git\", \"git_tag\": \"rapids-${rapids_version}-cccl-2.5.0\", \"always_download\": true}" \
+          | tee ~/rapids-cmake-override-versions.json;
         sccache -z;
-        build-all -DBUILD_BENCHMARKS=ON --verbose;
+        build-all \
+          -j$(nproc --ignore=1) -v \
+          -DBUILD_TESTS=ON \
+          -DBUILD_BENCHMARKS=ON \
+          -DCMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0" \
+          -DCMAKE_CUDA_FLAGS="-ftemplate-backtrace-limit=0" \
+          -DRAPIDS_CMAKE_CPM_DEFAULT_VERSION_FILE="${HOME}/rapids-cmake-override-versions.json";
         sccache -s;
   unit-tests-cudf-pandas:
     needs: wheel-build-cudf

From 4ff0c5968fae67e89729bb5515067ffcd6d6740b Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Tue, 14 May 2024 21:33:56 +0000
Subject: [PATCH 12/26] test rapids-cmake with CCCL 2.5

---
 rapids_config.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rapids_config.cmake b/rapids_config.cmake
index 3a88769f6e7..e7137bb71dd 100644
--- a/rapids_config.cmake
+++ b/rapids_config.cmake
@@ -26,6 +26,9 @@ else()
   )
 endif()
 
+set(rapids-cmake-repo trxcllnt/rapids-cmake)
+set(rapids-cmake-branch fea/cccl-2.5)
+
 if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
   file(
     DOWNLOAD

From f122905d83646b0eea4ad1fecdc775767567bc41 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Tue, 14 May 2024 21:34:17 +0000
Subject: [PATCH 13/26] pass cuco::cuda_stream_ref

---
 cpp/src/join/distinct_hash_join.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index ad401bdccba..0ac2ad8cf5b 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -181,8 +181,12 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
   auto const probe_indices_begin =
     thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});
 
-  auto const [probe_indices_end, _] = this->_hash_table.retrieve(
-    iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value());
+  auto const [probe_indices_end, _] =
+    this->_hash_table.retrieve(iter,
+                               iter + probe_table_num_rows,
+                               probe_indices_begin,
+                               build_indices_begin,
+                               cuco::cuda_stream_ref{stream.value()});
 
   auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
   build_indices->resize(actual_size, stream);

From 434600e22f8a5db6203b99938943b295246ef9f0 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Tue, 14 May 2024 21:34:30 +0000
Subject: [PATCH 14/26] revert changes to pr.yaml

---
 .github/workflows/pr.yaml | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index f684a6156dc..f9d5976f1fe 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -157,24 +157,8 @@ jobs:
       arch: '["amd64"]'
       cuda: '["12.2"]'
       build_command: |
-        # Tell rapids-cmake to use custom CCCL and cuCollections forks
-        rapids_branch="$(yq '.x-git-defaults.tag' /opt/rapids-build-utils/manifest.yaml)";
-        rapids_version="${rapids_branch#branch-}";
-        curl -fsSL -o- https://raw.githubusercontent.com/trxcllnt/rapids-cmake/branch-24.04-cccl-2.4.0/rapids-cmake/cpm/patches/cccl/revert_pr_211.diff \
-          | tee ~/rapids-cmake-revert_pr_211.diff;
-        curl -fsSL -o- "https://raw.githubusercontent.com/rapidsai/rapids-cmake/${rapids_branch}/rapids-cmake/cpm/versions.json" \
-          | jq -r ".packages.CCCL *= {\"version\": \"2.5.0\", \"git_tag\": \"pull-request/1667\"}" \
-          | jq -r "(.packages.CCCL.patches[] | select(.file == \"cccl/revert_pr_211.diff\")).file = \"${HOME}/rapids-cmake-revert_pr_211.diff\"" \
-          | jq -r ".packages.cuco *= {\"git_url\": \"https://github.com/trxcllnt/cuCollections.git\", \"git_tag\": \"rapids-${rapids_version}-cccl-2.5.0\", \"always_download\": true}" \
-          | tee ~/rapids-cmake-override-versions.json;
         sccache -z;
-        build-all \
-          -j$(nproc --ignore=1) -v \
-          -DBUILD_TESTS=ON \
-          -DBUILD_BENCHMARKS=ON \
-          -DCMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0" \
-          -DCMAKE_CUDA_FLAGS="-ftemplate-backtrace-limit=0" \
-          -DRAPIDS_CMAKE_CPM_DEFAULT_VERSION_FILE="${HOME}/rapids-cmake-override-versions.json";
+        build-all -DBUILD_BENCHMARKS=ON --verbose;
         sccache -s;
   unit-tests-cudf-pandas:
     needs: wheel-build-cudf

From ffdab59ecdef69a60320ba40a0ef777cfc14811b Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Tue, 21 May 2024 12:51:00 -0700
Subject: [PATCH 15/26] Update cpp/src/join/distinct_hash_join.cu

Co-authored-by: Yunsong Wang <yunsongw@nvidia.com>
---
 cpp/src/join/distinct_hash_join.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index 0ac2ad8cf5b..c8ecce05e70 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -186,7 +186,7 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
                                iter + probe_table_num_rows,
                                probe_indices_begin,
                                build_indices_begin,
-                               cuco::cuda_stream_ref{stream.value()});
+                               {stream.value()});
 
   auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
   build_indices->resize(actual_size, stream);

From 57d9eeafa12c56ecde236cb05e94ae2af85fa4c9 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Tue, 21 May 2024 20:33:04 +0000
Subject: [PATCH 16/26] fix lint

---
 cpp/src/join/distinct_hash_join.cu | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index c8ecce05e70..5048da25e86 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -181,12 +181,8 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
   auto const probe_indices_begin =
     thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});
 
-  auto const [probe_indices_end, _] =
-    this->_hash_table.retrieve(iter,
-                               iter + probe_table_num_rows,
-                               probe_indices_begin,
-                               build_indices_begin,
-                               {stream.value()});
+  auto const [probe_indices_end, _] = this->_hash_table.retrieve(
+    iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()});
 
   auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
   build_indices->resize(actual_size, stream);

From a011739d3f2a3ff62a2d4bbaf81de31c36446826 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Tue, 21 May 2024 22:39:35 +0000
Subject: [PATCH 17/26] Apply suggestions from reviewers

---
 .pre-commit-config.yaml                 | 4 ----
 cpp/src/groupby/sort/group_rank_scan.cu | 6 +++---
 cpp/src/transform/row_bit_count.cu      | 4 ++--
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3092dda3e72..2d3ffc287e9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -56,10 +56,6 @@ repos:
       - id: clang-format
         types_or: [c, c++, cuda]
         args: ["-fallback-style=none", "-style=file", "-i"]
-        exclude: |
-          (?x)^(
-            ^cpp/build/.*
-          )
   - repo: https://github.com/sirosen/texthooks
     rev: 0.6.6
     hooks:
diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu
index 8c6b987f17a..0b65889f127 100644
--- a/cpp/src/groupby/sort/group_rank_scan.cu
+++ b/cpp/src/groupby/sort/group_rank_scan.cu
@@ -134,10 +134,10 @@ std::unique_ptr<column> rank_generator(column_view const& grouped_values,
 
   auto [group_labels_begin, mutable_rank_begin] = [&]() {
     if constexpr (forward) {
-      return thrust::make_pair(group_labels.begin(), mutable_ranks.begin<size_type>());
+      return thrust::pair{group_labels.begin(), mutable_ranks.begin<size_type>()};
     } else {
-      return thrust::make_pair(thrust::reverse_iterator(group_labels.end()),
-                               thrust::reverse_iterator(mutable_ranks.end<size_type>()));
+      return thrust::pair{thrust::reverse_iterator(group_labels.end()),
+                          thrust::reverse_iterator(mutable_ranks.end<size_type>())};
     }
   }();
   thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 04ac483fd55..bfac7ab586e 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -425,7 +425,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span<column_device_view const> col
   // current row span - always starts at spanning over `segment_length` rows.
   auto const num_rows             = cols[0].size();
   auto const get_default_row_span = [=] {
-    return row_span{tid * segment_length, std::min((tid + 1) * segment_length, num_rows)};
+    return row_span{tid * segment_length, cuda::std::min((tid + 1) * segment_length, num_rows)};
   };
   auto cur_span = get_default_row_span();
 
@@ -514,7 +514,7 @@ std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
           // Since the number of rows may not divisible by segment_length,
           // the last segment may be shorter than the others.
           auto const current_length =
-            std::min(segment_length, num_rows - segment_length * segment_idx);
+            cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
           return per_row_size * current_length;
         }));
     return output;

From 78c1a892b99cb92696908d92083084007596c296 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Tue, 21 May 2024 22:41:51 +0000
Subject: [PATCH 18/26] revert more thrust::pair changes

---
 cpp/src/io/comp/nvcomp_adapter.cu | 2 +-
 cpp/src/lists/contains.cu         | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu
index 53db714fa68..adb53bbf629 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cu
+++ b/cpp/src/io/comp/nvcomp_adapter.cu
@@ -108,7 +108,7 @@ void skip_unsupported_inputs(device_span<size_t> input_sizes,
       input_sizes.begin(),
       status_size_it,
       [] __device__(auto const& status) {
-        return thrust::make_pair(0, compression_result{0, compression_status::SKIPPED});
+        return thrust::pair{0, compression_result{0, compression_status::SKIPPED}};
       },
       [max_size = max_valid_input_size.value()] __device__(size_t input_size) {
         return input_size > max_size;
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 0c80d548197..f03d394d6d7 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -105,10 +105,9 @@ __device__ auto element_index_pair_iter(size_type const size)
   auto const end   = thrust::make_counting_iterator(size);
 
   if constexpr (forward) {
-    return thrust::make_pair(begin, end);
+    return thrust::pair{begin, end};
   } else {
-    return thrust::make_pair(thrust::make_reverse_iterator(end),
-                             thrust::make_reverse_iterator(begin));
+    return thrust::pair{thrust::make_reverse_iterator(end), thrust::make_reverse_iterator(begin)};
   }
 }
 

From 68c59973bb903c473195c79fbc00a1c808f85b53 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 22 May 2024 03:43:58 +0000
Subject: [PATCH 19/26] fix lint

---
 cpp/src/io/comp/nvcomp_adapter.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu
index adb53bbf629..794d452ebf2 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cu
+++ b/cpp/src/io/comp/nvcomp_adapter.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 114db087f314a712ae9dfb89afcb58f32fed029c Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Wed, 22 May 2024 12:05:06 -0700
Subject: [PATCH 20/26] Apply suggestions from code review

---
 cpp/src/io/orc/reader_impl_decode.cu     | 2 +-
 cpp/src/io/parquet/page_string_decode.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index c229c00b305..74b90ecd26e 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -692,7 +692,7 @@ std::vector<range> find_table_splits(table_view const& input,
      d_sizes  = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
       // Since the number of rows may not divisible by segment_length,
       // the last segment may be shorter than the others.
-      auto const current_length = std::min(segment_length, num_rows - segment_length * segment_idx);
+      auto const current_length = cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
       auto const size = d_sizes[segment_idx] / CHAR_BIT;  // divide by CHAR_BIT to get size in bytes
       return cumulative_size{static_cast<std::size_t>(current_length),
                              static_cast<std::size_t>(size)};
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 381badad485..ba3d35b9586 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -1202,7 +1202,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span<PageInfo> pages,
                    pages.device_begin(),
                    pages.device_end(),
                    cuda::proclaim_return_type<bool>(
-                     [] __device__(auto const& page) { return page.temp_string_size != 0; }));
+                     [] __device__(auto& page) { return page.temp_string_size != 0; }));
 
   if (need_sizes) {
     // sum up all of the temp_string_sizes

From 64264a7a3b4b9e35784c712aa86d3fd1f4f3c96a Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 22 May 2024 20:37:08 -0700
Subject: [PATCH 21/26] fix lint

---
 cpp/src/io/orc/reader_impl_decode.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 74b90ecd26e..ec936b85761 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -692,7 +692,8 @@ std::vector<range> find_table_splits(table_view const& input,
      d_sizes  = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
       // Since the number of rows may not divisible by segment_length,
       // the last segment may be shorter than the others.
-      auto const current_length = cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+      auto const current_length =
+        cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
       auto const size = d_sizes[segment_idx] / CHAR_BIT;  // divide by CHAR_BIT to get size in bytes
       return cumulative_size{static_cast<std::size_t>(current_length),
                              static_cast<std::size_t>(size)};

From 902f06b6fbe1346e8f5191aa1008d1bbf9644caf Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Thu, 23 May 2024 08:49:22 +0000
Subject: [PATCH 22/26] include cuda/std/functional

---
 cpp/src/io/orc/reader_impl_decode.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index ec936b85761..eeee9c7eca3 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -36,6 +36,7 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuda/std/functional>
 #include <thrust/copy.h>
 #include <thrust/fill.h>
 #include <thrust/for_each.h>

From 3b293450ef80e538101e34a1f3cac8a7c29cdcb4 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Thu, 23 May 2024 09:19:43 +0000
Subject: [PATCH 23/26] cuda::std::min -> std::min

---
 cpp/src/io/orc/reader_impl_decode.cu | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index eeee9c7eca3..c229c00b305 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -36,7 +36,6 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <cuda/std/functional>
 #include <thrust/copy.h>
 #include <thrust/fill.h>
 #include <thrust/for_each.h>
@@ -693,8 +692,7 @@ std::vector<range> find_table_splits(table_view const& input,
      d_sizes  = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
       // Since the number of rows may not divisible by segment_length,
       // the last segment may be shorter than the others.
-      auto const current_length =
-        cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+      auto const current_length = std::min(segment_length, num_rows - segment_length * segment_idx);
       auto const size = d_sizes[segment_idx] / CHAR_BIT;  // divide by CHAR_BIT to get size in bytes
       return cumulative_size{static_cast<std::size_t>(current_length),
                              static_cast<std::size_t>(size)};

From 422538e9c5c69bfb964ab6d574a4d25908b254f6 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Fri, 24 May 2024 16:41:12 +0000
Subject: [PATCH 24/26] fix orc tests

---
 cpp/src/io/orc/reader_impl_decode.cu |  2 +-
 cpp/src/io/orc/stripe_init.cu        | 20 ++++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index c229c00b305..da9fb802a0a 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -692,7 +692,7 @@ std::vector<range> find_table_splits(table_view const& input,
      d_sizes  = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
       // Since the number of rows may not divisible by segment_length,
       // the last segment may be shorter than the others.
-      auto const current_length = std::min(segment_length, num_rows - segment_length * segment_idx);
+      auto const current_length = min(segment_length, num_rows - segment_length * segment_idx);
       auto const size = d_sizes[segment_idx] / CHAR_BIT;  // divide by CHAR_BIT to get size in bytes
       return cumulative_size{static_cast<std::size_t>(current_length),
                              static_cast<std::size_t>(size)};
diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu
index dd44b779402..0ec83f57032 100644
--- a/cpp/src/io/orc/stripe_init.cu
+++ b/cpp/src/io/orc/stripe_init.cu
@@ -561,20 +561,24 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info,
                                         uint32_t log2maxcr,
                                         rmm::cuda_stream_view stream)
 {
-  dim3 dim_block(128, 1);
-  dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
-  gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
-    strm_info, num_streams, compression_block_size, log2maxcr);
+  if (num_streams > 0) {
+    dim3 dim_block(128, 1);
+    dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
+    gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
+      strm_info, num_streams, compression_block_size, log2maxcr);
+  }
 }
 
 void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info,
                                           int32_t num_streams,
                                           rmm::cuda_stream_view stream)
 {
-  dim3 dim_block(128, 1);
-  dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
-  gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
-                                                                             num_streams);
+  if (num_streams > 0) {
+    dim3 dim_block(128, 1);
+    dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
+    gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
+                                                                               num_streams);
+  }
 }
 
 void __host__ ParseRowGroupIndex(RowGroup* row_groups,

From ef42695aa9d97c6ae274ee1de8e33d7770d36d4b Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Fri, 24 May 2024 17:09:08 +0000
Subject: [PATCH 25/26] compute and compare num_blocks instead of num_streams

---
 cpp/src/io/orc/stripe_init.cu | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu
index 0ec83f57032..89dbbcb796c 100644
--- a/cpp/src/io/orc/stripe_init.cu
+++ b/cpp/src/io/orc/stripe_init.cu
@@ -561,9 +561,10 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info,
                                         uint32_t log2maxcr,
                                         rmm::cuda_stream_view stream)
 {
-  if (num_streams > 0) {
+  auto const num_blocks = (num_streams + 3) >> 2;  // 1 stream per warp, 4 warps per block
+  if (num_blocks > 0) {
     dim3 dim_block(128, 1);
-    dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
+    dim3 dim_grid(num_blocks, 1);
     gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
       strm_info, num_streams, compression_block_size, log2maxcr);
   }
@@ -573,9 +574,10 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info,
                                           int32_t num_streams,
                                           rmm::cuda_stream_view stream)
 {
-  if (num_streams > 0) {
+  auto const num_blocks = (num_streams + 3) >> 2;  // 1 stream per warp, 4 warps per block
+  if (num_blocks > 0) {
     dim3 dim_block(128, 1);
-    dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
+    dim3 dim_grid(num_blocks, 1);
     gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
                                                                                num_streams);
   }

From d9a4947eacc8fe9e62d99f8fe8064c16afc1a990 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Fri, 24 May 2024 13:19:14 -0700
Subject: [PATCH 26/26] revert changes to use rapids-cmake CCCL 2.5 branch

---
 rapids_config.cmake | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/rapids_config.cmake b/rapids_config.cmake
index e7137bb71dd..3a88769f6e7 100644
--- a/rapids_config.cmake
+++ b/rapids_config.cmake
@@ -26,9 +26,6 @@ else()
   )
 endif()
 
-set(rapids-cmake-repo trxcllnt/rapids-cmake)
-set(rapids-cmake-branch fea/cccl-2.5)
-
 if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS-${RAPIDS_VERSION_MAJOR_MINOR}.cmake")
   file(
     DOWNLOAD