Abstracting block reduce and block scan from cuIO kernels with cub apis (#7278)

closes #6238 

This PR replaces existing usages of `warp_reduce` and `warp_scan` that were used for block-wide reduction/scan with `cub::BlockReduce`/`cub::BlockScan`.

The changes have a positive effect mostly on numerical data processing, but appear to be slightly slower for string types.
[all files.zip](https://github.com/rapidsai/cudf/files/5921314/all.files.zip)

Update: Graphs have been updated after fixing a bug which also resolved several other performance issues.
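
The core of the change can be illustrated with a minimal sketch (a hypothetical kernel, not code from this PR): the old two-stage pattern of warp-level reductions staged through shared memory collapses into a single `cub::BlockReduce` call, with the result valid on thread 0, matching how the cuIO kernels write the block total.

```cuda
#include <cub/cub.cuh>
#include <cstdint>

constexpr int block_size = 512;

// Sketch: one BlockReduce call replaces the warp_reduce + scratch +
// second warp_reduce sequence used before this PR. Names are illustrative.
__global__ void __launch_bounds__(block_size)
sum_kernel(uint32_t const *in, uint32_t *out, uint32_t n)
{
  using block_reduce = cub::BlockReduce<uint32_t, block_size>;
  __shared__ typename block_reduce::TempStorage temp_storage;

  uint32_t idx = blockIdx.x * block_size + threadIdx.x;
  uint32_t val = (idx < n) ? in[idx] : 0;

  // The reduction result is only guaranteed valid on thread 0.
  uint32_t block_sum = block_reduce(temp_storage).Sum(val);
  if (threadIdx.x == 0) { out[blockIdx.x] = block_sum; }
}
```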

<details>
<summary>Perf plots</summary>

**Benchmark Performance**

The y-axis is in `ms`, and there are three sets of plots. The first compares mean performance side by side, with error bars showing the standard deviation over five benchmark runs. The second shows the difference in performance between the `cub::BlockReduce`/`cub::BlockScan` approach and the generic approach. The third shows the percentage change in performance relative to branch-0.18: a positive value means the test takes less time than on the main branch, and a negative value means it takes more.

**CSV READER**
![CSV_READER_comp_0_9](https://user-images.githubusercontent.com/42624703/106804864-90914b00-662b-11eb-8e5c-10510e4d4666.png)
![CSV_READER_comp_10_19](https://user-images.githubusercontent.com/42624703/106804866-9129e180-662b-11eb-8deb-a5090e4e2d9a.png)
![CSV_READER_diff_0_39](https://user-images.githubusercontent.com/42624703/106804867-9129e180-662b-11eb-95d3-5415cfe2a58d.png)
![CSV_READER_per_0_39](https://user-images.githubusercontent.com/42624703/106804869-91c27800-662b-11eb-8ecf-2bf16ccc8c00.png)
![ORC_WRITER_comp_40_49](https://user-images.githubusercontent.com/42624703/106804870-91c27800-662b-11eb-8cad-8d06fd299ce8.png)



**ORC READER**
![ORC_READER_comp_0_9](https://user-images.githubusercontent.com/42624703/107102359-f0c8ee00-67df-11eb-9140-4f9f1147ee02.png)
![ORC_READER_comp_10_19](https://user-images.githubusercontent.com/42624703/107102360-f1618480-67df-11eb-80d5-eab360c80d85.png)
![ORC_READER_comp_20_29](https://user-images.githubusercontent.com/42624703/107102361-f1618480-67df-11eb-8729-d725a8beb91e.png)
![ORC_READER_comp_30_39](https://user-images.githubusercontent.com/42624703/107102362-f1fa1b00-67df-11eb-8541-59e6643791b5.png)
![ORC_READER_comp_40_49](https://user-images.githubusercontent.com/42624703/107102363-f1fa1b00-67df-11eb-864f-f99f2346404b.png)
![ORC_READER_comp_50_59](https://user-images.githubusercontent.com/42624703/107102364-f1fa1b00-67df-11eb-962d-9f71db2b6caf.png)
![ORC_READER_comp_60_69](https://user-images.githubusercontent.com/42624703/107102365-f1fa1b00-67df-11eb-8447-5cc17af20e51.png)
![ORC_READER_comp_70_79](https://user-images.githubusercontent.com/42624703/107102366-f292b180-67df-11eb-9d0c-730be0741eac.png)
![ORC_READER_diff_0_39](https://user-images.githubusercontent.com/42624703/107102368-f292b180-67df-11eb-8b0b-0ab94f06276b.png)
![ORC_READER_diff_40_79](https://user-images.githubusercontent.com/42624703/107102370-f292b180-67df-11eb-9d57-ce820dd586c6.png)
![ORC_READER_per_0_39](https://user-images.githubusercontent.com/42624703/107102371-f32b4800-67df-11eb-80ad-317ee8b29a51.png)
![ORC_READER_per_40_79](https://user-images.githubusercontent.com/42624703/107102372-f32b4800-67df-11eb-8180-fe59a4c2d5da.png)


**ORC WRITER**
![ORC_WRITER_comp_0_9](https://user-images.githubusercontent.com/42624703/107102389-03432780-67e0-11eb-8782-a71ddadb72f3.png)
![ORC_WRITER_comp_10_19](https://user-images.githubusercontent.com/42624703/107102390-03dbbe00-67e0-11eb-8825-4896cffabb49.png)
![ORC_WRITER_comp_20_29](https://user-images.githubusercontent.com/42624703/107102392-03dbbe00-67e0-11eb-94dd-6380105f25e8.png)
![ORC_WRITER_comp_30_39](https://user-images.githubusercontent.com/42624703/107102393-03dbbe00-67e0-11eb-853f-6d2d518c038f.png)
![ORC_WRITER_comp_40_49](https://user-images.githubusercontent.com/42624703/107102394-04745480-67e0-11eb-8be4-48a3efdbf446.png)
![ORC_WRITER_comp_50_59](https://user-images.githubusercontent.com/42624703/107102395-04745480-67e0-11eb-8272-4563f28e9bf6.png)
![ORC_WRITER_comp_60_69](https://user-images.githubusercontent.com/42624703/107102396-04745480-67e0-11eb-9597-58142b1dfa99.png)
![ORC_WRITER_comp_70_79](https://user-images.githubusercontent.com/42624703/107102397-050ceb00-67e0-11eb-8e9f-56305d11cfbc.png)
![ORC_WRITER_comp_80_89](https://user-images.githubusercontent.com/42624703/107102398-050ceb00-67e0-11eb-912e-1cea0a96721b.png)
![ORC_WRITER_comp_90_99](https://user-images.githubusercontent.com/42624703/107102399-050ceb00-67e0-11eb-86f4-98de207ec87d.png)
![ORC_WRITER_diff_0_39](https://user-images.githubusercontent.com/42624703/107102401-05a58180-67e0-11eb-911c-66ea27d49633.png)
![ORC_WRITER_diff_40_79](https://user-images.githubusercontent.com/42624703/107102402-05a58180-67e0-11eb-8d3b-5a9f51620ae2.png)
![ORC_WRITER_diff_80_119](https://user-images.githubusercontent.com/42624703/107102403-05a58180-67e0-11eb-9e6c-9a0d3873502b.png)
![ORC_WRITER_per_0_39](https://user-images.githubusercontent.com/42624703/107102404-063e1800-67e0-11eb-9534-646db30944b6.png)
![ORC_WRITER_per_40_79](https://user-images.githubusercontent.com/42624703/107102406-063e1800-67e0-11eb-89bb-b77e5c627f68.png)
![ORC_WRITER_per_80_119](https://user-images.githubusercontent.com/42624703/107102407-063e1800-67e0-11eb-9be3-3d5bb106525c.png)


**PARQUET CHUNKED WRITER**
![PQ_CHUNK_WRITER_comp_0_9](https://user-images.githubusercontent.com/42624703/107102412-0b02cc00-67e0-11eb-9038-162b4bcfe2a7.png)
![PQ_CHUNK_WRITER_diff_0_39](https://user-images.githubusercontent.com/42624703/107102413-0b9b6280-67e0-11eb-8af1-a06c1b15f987.png)
![PQ_CHUNK_WRITER_per_0_39](https://user-images.githubusercontent.com/42624703/107102415-0b9b6280-67e0-11eb-860b-f8ac579af9c8.png)



**PARQUET READER**
![PQ_READER_comp_0_9](https://user-images.githubusercontent.com/42624703/107102421-13f39d80-67e0-11eb-9bd9-5d45094a4cc8.png)
![PQ_READER_comp_10_19](https://user-images.githubusercontent.com/42624703/107102422-148c3400-67e0-11eb-93d7-d37b6f8f0346.png)
![PQ_READER_comp_20_29](https://user-images.githubusercontent.com/42624703/107102423-148c3400-67e0-11eb-9ed0-3c613311164b.png)
![PQ_READER_comp_30_39](https://user-images.githubusercontent.com/42624703/107102424-1524ca80-67e0-11eb-8773-328871b75e6f.png)
![PQ_READER_comp_40_49](https://user-images.githubusercontent.com/42624703/107102425-1524ca80-67e0-11eb-9841-dd2fae2e00e3.png)
![PQ_READER_comp_50_59](https://user-images.githubusercontent.com/42624703/107102426-1524ca80-67e0-11eb-8cb3-36cde66909c0.png)
![PQ_READER_comp_60_69](https://user-images.githubusercontent.com/42624703/107102429-15bd6100-67e0-11eb-9825-5eb65d066343.png)
![PQ_READER_comp_70_79](https://user-images.githubusercontent.com/42624703/107102430-15bd6100-67e0-11eb-9570-e3f3f4df6e0a.png)
![PQ_READER_comp_80_89](https://user-images.githubusercontent.com/42624703/107102431-15bd6100-67e0-11eb-825e-633f33713844.png)
![PQ_READER_comp_90_99](https://user-images.githubusercontent.com/42624703/107102432-1655f780-67e0-11eb-9e51-4fbca0c640fe.png)
![PQ_READER_diff_0_39](https://user-images.githubusercontent.com/42624703/107102433-1655f780-67e0-11eb-9c61-457d7bdca759.png)
![PQ_READER_diff_40_79](https://user-images.githubusercontent.com/42624703/107102434-1655f780-67e0-11eb-9471-7b071bfc8e72.png)
![PQ_READER_diff_80_119](https://user-images.githubusercontent.com/42624703/107102435-16ee8e00-67e0-11eb-83a5-5ee304fdf8b7.png)
![PQ_READER_per_0_39](https://user-images.githubusercontent.com/42624703/107102436-16ee8e00-67e0-11eb-9df2-dbe74defd858.png)
![PQ_READER_per_40_79](https://user-images.githubusercontent.com/42624703/107102438-17872480-67e0-11eb-81d0-9611b2f554c4.png)
![PQ_READER_per_80_119](https://user-images.githubusercontent.com/42624703/107102441-17872480-67e0-11eb-81c7-a30ffc3d74a5.png)


**PARQUET WRITER**
![PQ_WRITER_comp_0_9](https://user-images.githubusercontent.com/42624703/107102450-22da5000-67e0-11eb-8cfe-8198bdfef3b5.png)
![PQ_WRITER_comp_10_19](https://user-images.githubusercontent.com/42624703/107102451-2372e680-67e0-11eb-9036-f10cdf577e7e.png)
![PQ_WRITER_comp_20_29](https://user-images.githubusercontent.com/42624703/107102452-2372e680-67e0-11eb-966a-dd584ba24aa6.png)
![PQ_WRITER_comp_30_39](https://user-images.githubusercontent.com/42624703/107102453-240b7d00-67e0-11eb-9cd6-f2ee98eb679e.png)
![PQ_WRITER_comp_40_49](https://user-images.githubusercontent.com/42624703/107102454-240b7d00-67e0-11eb-87bd-0ed78398394b.png)
![PQ_WRITER_comp_50_59](https://user-images.githubusercontent.com/42624703/107102455-240b7d00-67e0-11eb-8769-ebf545d1f37d.png)
![PQ_WRITER_comp_60_69](https://user-images.githubusercontent.com/42624703/107102456-24a41380-67e0-11eb-82be-00729da0fad0.png)
![PQ_WRITER_comp_70_79](https://user-images.githubusercontent.com/42624703/107102457-24a41380-67e0-11eb-8117-7704b7c7b085.png)
![PQ_WRITER_comp_80_89](https://user-images.githubusercontent.com/42624703/107102458-24a41380-67e0-11eb-99de-df5b077b6b5e.png)
![PQ_WRITER_comp_90_99](https://user-images.githubusercontent.com/42624703/107102459-253caa00-67e0-11eb-9393-2f2c54021bab.png)
![PQ_WRITER_comp_100_109](https://user-images.githubusercontent.com/42624703/107102462-253caa00-67e0-11eb-9c22-d7c48b2bbb02.png)
![PQ_WRITER_comp_110_119](https://user-images.githubusercontent.com/42624703/107102464-253caa00-67e0-11eb-896d-52375e61c528.png)
![PQ_WRITER_comp_120_129](https://user-images.githubusercontent.com/42624703/107102465-25d54080-67e0-11eb-9ab7-eb37f1986c7c.png)
![PQ_WRITER_comp_130_139](https://user-images.githubusercontent.com/42624703/107102466-25d54080-67e0-11eb-8b68-9eb660a81a9b.png)
![PQ_WRITER_diff_0_39](https://user-images.githubusercontent.com/42624703/107102467-25d54080-67e0-11eb-989d-4c1994bb127b.png)
![PQ_WRITER_diff_40_79](https://user-images.githubusercontent.com/42624703/107102468-266dd700-67e0-11eb-98c1-7b6af9155cb4.png)
![PQ_WRITER_diff_80_119](https://user-images.githubusercontent.com/42624703/107102469-266dd700-67e0-11eb-921d-f7e83009c68f.png)
![PQ_WRITER_diff_120_159](https://user-images.githubusercontent.com/42624703/107102470-266dd700-67e0-11eb-8f4f-42f920d30bbc.png)
![PQ_WRITER_per_0_39](https://user-images.githubusercontent.com/42624703/107102472-27066d80-67e0-11eb-9e40-aa06098bbdae.png)
![PQ_WRITER_per_40_79](https://user-images.githubusercontent.com/42624703/107102474-27066d80-67e0-11eb-9227-aff94e67ecfc.png)
![PQ_WRITER_per_80_119](https://user-images.githubusercontent.com/42624703/107102476-279f0400-67e0-11eb-91c5-ad01f1aa09ea.png)
![PQ_WRITER_per_120_159](https://user-images.githubusercontent.com/42624703/107102477-279f0400-67e0-11eb-89ea-c9a5e113c628.png)

Authors:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

Approvers:
  - Devavret Makkar (@devavret)
  - Vukasin Milovanovic (@vuule)

URL: #7278
rgsl888prabhu authored Feb 12, 2021
1 parent f3bf0e5 commit 7c609d2
Showing 9 changed files with 354 additions and 619 deletions.
31 changes: 12 additions & 19 deletions cpp/src/io/csv/csv_gpu.cu
@@ -860,13 +860,11 @@ __global__ void __launch_bounds__(rowofs_block_dim)
int escapechar,
int commentchar)
{
auto start = data.begin();
__shared__ __align__(8) uint64_t ctxtree[rowofs_block_dim * 2];
using warp_reduce = typename cub::WarpReduce<uint32_t>;
using half_warp_reduce = typename cub::WarpReduce<uint32_t, 16>;
auto start = data.begin();
using block_reduce = typename cub::BlockReduce<uint32_t, rowofs_block_dim>;
__shared__ union {
typename warp_reduce::TempStorage full;
typename half_warp_reduce::TempStorage half[rowofs_block_dim / 32];
typename block_reduce::TempStorage bk_storage;
__align__(8) uint64_t ctxtree[rowofs_block_dim * 2];
} temp_storage;

const char *end = start + (min(parse_pos + chunk_size, data_size) - start_offset);
@@ -936,16 +934,16 @@ __global__ void __launch_bounds__(rowofs_block_dim)
// Convert the long-form {rowmap,outctx}[inctx] version into packed version
// {rowcount,ouctx}[inctx], then merge the row contexts of the 32-character blocks into
// a single 16K-character block context
rowctx_merge_transform(ctxtree, pack_rowmaps(ctx_map), t);
rowctx_merge_transform(temp_storage.ctxtree, pack_rowmaps(ctx_map), t);

// If this is the second phase, get the block's initial parser state and row counter
if (offsets_out.data()) {
if (t == 0) { ctxtree[0] = row_ctx[blockIdx.x]; }
if (t == 0) { temp_storage.ctxtree[0] = row_ctx[blockIdx.x]; }
__syncthreads();

// Walk back the transform tree with the known initial parser state
rowctx32_t ctx = rowctx_inverse_merge_transform(ctxtree, t);
uint64_t row = (ctxtree[0] >> 2) + (ctx >> 2);
rowctx32_t ctx = rowctx_inverse_merge_transform(temp_storage.ctxtree, t);
uint64_t row = (temp_storage.ctxtree[0] >> 2) + (ctx >> 2);
uint32_t rows_out_of_range = 0;
uint32_t rowmap = select_rowmap(ctx_map, ctx & 3);
// Output row positions
@@ -960,18 +958,13 @@ __global__ void __launch_bounds__(rowofs_block_dim)
row++;
rowmap >>= pos;
}
// Return the number of rows out of range
rows_out_of_range = half_warp_reduce(temp_storage.half[t / 32]).Sum(rows_out_of_range);
__syncthreads();
if (!(t & 0xf)) { ctxtree[t >> 4] = rows_out_of_range; }
__syncthreads();
if (t < 32) {
rows_out_of_range = warp_reduce(temp_storage.full).Sum(static_cast<uint32_t>(ctxtree[t]));
if (t == 0) { row_ctx[blockIdx.x] = rows_out_of_range; }
}
// Return the number of rows out of range
rows_out_of_range = block_reduce(temp_storage.bk_storage).Sum(rows_out_of_range);
if (t == 0) { row_ctx[blockIdx.x] = rows_out_of_range; }
} else {
// Just store the row counts and output contexts
if (t == 0) { row_ctx[blockIdx.x] = ctxtree[1]; }
if (t == 0) { row_ctx[blockIdx.x] = temp_storage.ctxtree[1]; }
}
}

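
The csv_gpu.cu change above also unions cub's `TempStorage` with the kernel's own `ctxtree` scratch array to save shared memory. A minimal sketch of that idiom (hypothetical names, not cudf code) — the one requirement is a `__syncthreads()` at every transition between the two uses of the shared bytes:

```cuda
#include <cub/cub.cuh>
#include <cstdint>

constexpr int block_size = 512;

// Sketch: cub's TempStorage and a kernel-owned scratch array share the
// same shared-memory bytes via a union; each switch between the two
// uses must be fenced with __syncthreads().
__global__ void __launch_bounds__(block_size)
union_storage_kernel(uint32_t *out)
{
  using block_reduce = cub::BlockReduce<uint32_t, block_size>;
  __shared__ union {
    typename block_reduce::TempStorage reduce_storage;
    __align__(8) uint64_t scratch[block_size];
  } temp_storage;

  int t = threadIdx.x;
  temp_storage.scratch[t] = t;   // first use: scratch array
  __syncthreads();
  uint32_t v = static_cast<uint32_t>(temp_storage.scratch[block_size - 1 - t]);
  __syncthreads();               // bytes are about to be reused by cub
  uint32_t s = block_reduce(temp_storage.reduce_storage).Sum(v);
  if (t == 0) { out[blockIdx.x] = s; }  // sum of 0..block_size-1
}
```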
121 changes: 47 additions & 74 deletions cpp/src/io/orc/dict_enc.cu
@@ -62,13 +62,17 @@ static inline __device__ uint32_t nvstr_init_hash(char const *ptr, uint32_t len)
*
* @param[in,out] s dictionary builder state
* @param[in] t thread id
* @param[in] temp_storage shared memory storage to scan non-null positions
*/
static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, int t)
template <int block_size, typename Storage>
static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
int t,
Storage &temp_storage)
{
if (t == 0) { s->nnz = 0; }
for (uint32_t i = 0; i < s->chunk.num_rows; i += 512) {
const uint32_t *valid_map = s->chunk.valid_map_base;
uint32_t is_valid, nz_map, nz_pos;
uint32_t is_valid, nz_pos;
if (t < 16) {
if (!valid_map) {
s->scratch_red[t] = 0xffffffffu;
@@ -88,18 +92,13 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, int t)
}
__syncthreads();
is_valid = (i + t < s->chunk.num_rows) ? (s->scratch_red[t >> 5] >> (t & 0x1f)) & 1 : 0;
nz_map = ballot(is_valid);
nz_pos = s->nnz + __popc(nz_map & (0x7fffffffu >> (0x1fu - ((uint32_t)t & 0x1f))));
if (!(t & 0x1f)) { s->scratch_red[16 + (t >> 5)] = __popc(nz_map); }
uint32_t tmp_nnz;
cub::BlockScan<uint32_t, block_size, cub::BLOCK_SCAN_WARP_SCANS>(temp_storage)
.ExclusiveSum(is_valid, nz_pos, tmp_nnz);
nz_pos += s->nnz;
__syncthreads();
if (t < 32) {
uint32_t nnz = s->scratch_red[16 + (t & 0xf)];
uint32_t nnz_pos = WarpReducePos16(nnz, t);
if (t == 0xf) { s->nnz += nnz_pos; }
if (t <= 0xf) { s->scratch_red[t] = nnz_pos - nnz; }
}
__syncthreads();
if (is_valid) { s->dict[nz_pos + s->scratch_red[t >> 5]] = i + t; }
if (!t) { s->nnz += tmp_nnz; }
if (is_valid) { s->dict[nz_pos] = i + t; }
__syncthreads();
}
}
@@ -116,11 +115,13 @@ __global__ void __launch_bounds__(block_size, 2)
gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns)
{
__shared__ __align__(16) dictinit_state_s state_g;
using warp_reduce = cub::WarpReduce<uint32_t>;
using half_warp_reduce = cub::WarpReduce<uint32_t, 16>;

using block_reduce = cub::BlockReduce<uint32_t, block_size>;
using block_scan = cub::BlockScan<uint32_t, block_size, cub::BLOCK_SCAN_WARP_SCANS>;

__shared__ union {
typename warp_reduce::TempStorage full[block_size / 32];
typename half_warp_reduce::TempStorage half[block_size / 32];
typename block_reduce::TempStorage reduce_storage;
typename block_scan::TempStorage scan_storage;
} temp_storage;

dictinit_state_s *const s = &state_g;
@@ -138,7 +139,7 @@ __global__ void __launch_bounds__(block_size, 2)
__syncthreads();
// First, take care of NULLs, and count how many strings we have (TODO: bypass this step when
// there are no nulls)
LoadNonNullIndices(s, t);
LoadNonNullIndices<block_size>(s, t, temp_storage.scan_storage);
// Sum the lengths of all the strings
if (t == 0) {
s->chunk.string_char_count = 0;
@@ -157,13 +158,8 @@ __global__ void __launch_bounds__(block_size, 2)
len = static_cast<uint32_t>(ck_data[ck_row].count);
hash = nvstr_init_hash(ck_data[ck_row].ptr, len);
}
len = half_warp_reduce(temp_storage.half[t / 32]).Sum(len);
if (!(t & 0xf)) { s->scratch_red[t >> 4] = len; }
__syncthreads();
if (t < 32) {
len = warp_reduce(temp_storage.full[t / 32]).Sum(s->scratch_red[t]);
if (t == 0) s->chunk.string_char_count += len;
}
len = block_reduce(temp_storage.reduce_storage).Sum(len);
if (t == 0) s->chunk.string_char_count += len;
if (i + t < nnz) {
atomicAdd(&s->map.u32[hash >> 1], 1 << ((hash & 1) ? 16 : 0));
dict_data[i + t] = start_row + ck_row;
@@ -182,21 +178,13 @@ __global__ void __launch_bounds__(block_size, 2)
uint32_t sum23 = count23 + (count23 << 16);
uint32_t sum45 = count45 + (count45 << 16);
uint32_t sum67 = count67 + (count67 << 16);
uint32_t sum_w, tmp;
sum23 += (sum01 >> 16) * 0x10001;
sum45 += (sum23 >> 16) * 0x10001;
sum67 += (sum45 >> 16) * 0x10001;
sum_w = sum67 >> 16;
sum_w = WarpReducePos16(sum_w, t);
if ((t & 0xf) == 0xf) { s->scratch_red[t >> 4] = sum_w; }
__syncthreads();
if (t < 32) {
uint32_t sum_b = WarpReducePos32(s->scratch_red[t], t);
s->scratch_red[t] = sum_b;
}
uint32_t sum_w = sum67 >> 16;
block_scan(temp_storage.scan_storage).InclusiveSum(sum_w, sum_w);
__syncthreads();
tmp = (t >= 16) ? s->scratch_red[(t >> 4) - 1] : 0;
sum_w = (sum_w - (sum67 >> 16) + tmp) * 0x10001;
sum_w = (sum_w - (sum67 >> 16)) * 0x10001;
s->map.u32[t * 4 + 0] = sum_w + sum01 - count01;
s->map.u32[t * 4 + 1] = sum_w + sum23 - count23;
s->map.u32[t * 4 + 2] = sum_w + sum45 - count45;
@@ -239,7 +227,7 @@ __global__ void __launch_bounds__(block_size, 2)
// map, the position of the first string can be inferred from the hash map counts
dict_char_count = 0;
for (uint32_t i = 0; i < nnz; i += block_size) {
uint32_t ck_row = 0, ck_row_ref = 0, is_dupe = 0, dupe_mask, dupes_before;
uint32_t ck_row = 0, ck_row_ref = 0, is_dupe = 0;
if (i + t < nnz) {
const char *str1, *str2;
uint32_t len1, len2, hash;
@@ -255,33 +243,23 @@ __global__ void __launch_bounds__(block_size, 2)
dict_char_count += (is_dupe) ? 0 : len1;
}
}
dupe_mask = ballot(is_dupe);
dupes_before = s->total_dupes + __popc(dupe_mask & ((2 << (t & 0x1f)) - 1));
if (!(t & 0x1f)) { s->scratch_red[t >> 5] = __popc(dupe_mask); }
__syncthreads();
if (t < 32) {
uint32_t warp_dupes = (t < 16) ? s->scratch_red[t] : 0;
uint32_t warp_pos = WarpReducePos16(warp_dupes, t);
if (t == 0xf) { s->total_dupes += warp_pos; }
if (t < 16) { s->scratch_red[t] = warp_pos - warp_dupes; }
}
uint32_t dupes_in_block;
uint32_t dupes_before;
block_scan(temp_storage.scan_storage).InclusiveSum(is_dupe, dupes_before, dupes_in_block);
dupes_before += s->total_dupes;
__syncthreads();
if (!t) { s->total_dupes += dupes_in_block; }
if (i + t < nnz) {
if (!is_dupe) {
dupes_before += s->scratch_red[t >> 5];
dict_data[i + t - dupes_before] = ck_row + start_row;
} else {
s->chunk.dict_index[ck_row + start_row] = (ck_row_ref + start_row) | (1u << 31);
}
}
}
dict_char_count = warp_reduce(temp_storage.full[t / 32]).Sum(dict_char_count);
if (!(t & 0x1f)) { s->scratch_red[t >> 5] = dict_char_count; }
__syncthreads();
if (t < 32) {
dict_char_count =
half_warp_reduce(temp_storage.half[t / 32]).Sum((t < 16) ? s->scratch_red[t] : 0);
}
// temp_storage is being used twice, so make sure there is `__syncthreads()` between them
// while making any future changes.
dict_char_count = block_reduce(temp_storage.reduce_storage).Sum(dict_char_count);
if (!t) {
chunks[group_id * num_columns + col_id].num_strings = nnz;
chunks[group_id * num_columns + col_id].string_char_count = s->chunk.string_char_count;
@@ -362,8 +340,12 @@ __global__ void __launch_bounds__(block_size)
gpuBuildStripeDictionaries(StripeDictionary *stripes, uint32_t num_columns)
{
__shared__ __align__(16) build_state_s state_g;
using warp_reduce = cub::WarpReduce<uint32_t>;
__shared__ typename warp_reduce::TempStorage temp_storage[block_size / 32];
using block_reduce = cub::BlockReduce<uint32_t, block_size>;
using block_scan = cub::BlockScan<uint32_t, block_size, cub::BLOCK_SCAN_WARP_SCANS>;
__shared__ union {
typename block_reduce::TempStorage reduce_storage;
typename block_scan::TempStorage scan_storage;
} temp_storage;

build_state_s *const s = &state_g;
uint32_t col_id = blockIdx.x;
@@ -384,8 +366,8 @@ __global__ void __launch_bounds__(block_size)
str_data = static_cast<const nvstrdesc_s *>(s->stripe.column_data_base);
dict_char_count = 0;
for (uint32_t i = 0; i < num_strings; i += block_size) {
uint32_t cur = (i + t < num_strings) ? dict_data[i + t] : 0;
uint32_t dupe_mask, dupes_before, cur_len = 0;
uint32_t cur = (i + t < num_strings) ? dict_data[i + t] : 0;
uint32_t cur_len = 0;
const char *cur_ptr;
bool is_dupe = false;
if (i + t < num_strings) {
@@ -397,28 +379,19 @@ __global__ void __launch_bounds__(block_size)
is_dupe = nvstr_is_equal(cur_ptr, cur_len, str_data[prev].ptr, str_data[prev].count);
}
dict_char_count += (is_dupe) ? 0 : cur_len;
dupe_mask = ballot(is_dupe);
dupes_before = s->total_dupes + __popc(dupe_mask & ((2 << (t & 0x1f)) - 1));
if (!(t & 0x1f)) { s->scratch_red[t >> 5] = __popc(dupe_mask); }
__syncthreads();
if (t < 32) {
uint32_t warp_dupes = s->scratch_red[t];
uint32_t warp_pos = WarpReducePos32(warp_dupes, t);
if (t == 0x1f) { s->total_dupes += warp_pos; }
s->scratch_red[t] = warp_pos - warp_dupes;
}
uint32_t dupes_in_block;
uint32_t dupes_before;
block_scan(temp_storage.scan_storage).InclusiveSum(is_dupe, dupes_before, dupes_in_block);
dupes_before += s->total_dupes;
__syncthreads();
if (!t) { s->total_dupes += dupes_in_block; }
if (i + t < num_strings) {
dupes_before += s->scratch_red[t >> 5];
dict_index[cur] = i + t - dupes_before;
if (!is_dupe && dupes_before != 0) { dict_data[i + t - dupes_before] = cur; }
}
__syncthreads();
}
dict_char_count = warp_reduce(temp_storage[t / 32]).Sum(dict_char_count);
if (!(t & 0x1f)) { s->scratch_red[t >> 5] = dict_char_count; }
__syncthreads();
if (t < 32) { dict_char_count = warp_reduce(temp_storage[t / 32]).Sum(s->scratch_red[t]); }
dict_char_count = block_reduce(temp_storage.reduce_storage).Sum(dict_char_count);
if (t == 0) {
stripes[stripe_id * num_columns + col_id].num_strings = num_strings - s->total_dupes;
stripes[stripe_id * num_columns + col_id].dict_char_count = dict_char_count;
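
The dict_enc.cu changes above repeatedly use the same scan-based compaction idiom: an exclusive prefix sum over a 0/1 flag gives each surviving element its output slot, and the block-wide total advances a running counter kept by thread 0. A standalone sketch of that idiom (illustrative names, not cudf code):

```cuda
#include <cub/cub.cuh>
#include <cstdint>

constexpr int block_size = 512;

// Sketch: compact the indices of set flags using BlockScan::ExclusiveSum,
// carrying a running count across block-sized tiles in shared memory.
__global__ void __launch_bounds__(block_size)
compact_valid(uint32_t const *flags, uint32_t *positions, uint32_t *nnz, uint32_t n)
{
  using block_scan = cub::BlockScan<uint32_t, block_size, cub::BLOCK_SCAN_WARP_SCANS>;
  __shared__ typename block_scan::TempStorage temp_storage;
  __shared__ uint32_t running_nnz;

  if (threadIdx.x == 0) { running_nnz = 0; }
  __syncthreads();

  for (uint32_t i = 0; i < n; i += block_size) {
    uint32_t is_valid = (i + threadIdx.x < n) ? flags[i + threadIdx.x] : 0;
    uint32_t pos, block_total;
    block_scan(temp_storage).ExclusiveSum(is_valid, pos, block_total);
    pos += running_nnz;
    __syncthreads();  // all reads of running_nnz done; temp_storage reused next pass
    if (is_valid) { positions[pos] = i + threadIdx.x; }
    if (threadIdx.x == 0) { running_nnz += block_total; }
    __syncthreads();
  }
  if (threadIdx.x == 0) { *nnz = running_nnz; }
}
```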
35 changes: 14 additions & 21 deletions cpp/src/io/orc/stats_enc.cu
@@ -66,8 +66,7 @@ __global__ void __launch_bounds__(init_threads_per_block)
* @param[in] statistics_count Number of statistics buffers
*/
constexpr unsigned int buffersize_reduction_dim = 32;
constexpr unsigned int buffersize_threads_per_block =
buffersize_reduction_dim * buffersize_reduction_dim;
constexpr unsigned int block_size = buffersize_reduction_dim * buffersize_reduction_dim;
constexpr unsigned int pb_fld_hdrlen = 1;
constexpr unsigned int pb_fld_hdrlen16 = 2; // > 127-byte length
constexpr unsigned int pb_fld_hdrlen32 = 5; // > 16KB length
@@ -77,19 +76,18 @@ constexpr unsigned int pb_fldlen_decimal = 40; // Assume decimal2string fits in
constexpr unsigned int pb_fldlen_bucket1 = 1 + pb_fldlen_int64;
constexpr unsigned int pb_fldlen_common = 2 * pb_fld_hdrlen + pb_fldlen_int64;

__global__ void __launch_bounds__(buffersize_threads_per_block, 1)
template <unsigned int block_size>
__global__ void __launch_bounds__(block_size, 1)
gpu_init_statistics_buffersize(statistics_merge_group *groups,
const statistics_chunk *chunks,
uint32_t statistics_count)
{
__shared__ volatile uint32_t scratch_red[buffersize_reduction_dim];
__shared__ volatile uint32_t stats_size;
uint32_t tx = threadIdx.x;
uint32_t ty = threadIdx.y;
uint32_t t = ty * buffersize_reduction_dim + tx;
if (!t) { stats_size = 0; }
using block_scan = cub::BlockScan<uint32_t, block_size, cub::BLOCK_SCAN_WARP_SCANS>;
__shared__ typename block_scan::TempStorage temp_storage;
volatile uint32_t stats_size = 0;
uint32_t t = threadIdx.x;
__syncthreads();
for (uint32_t start = 0; start < statistics_count; start += buffersize_threads_per_block) {
for (uint32_t start = 0; start < statistics_count; start += block_size) {
uint32_t stats_len = 0, stats_pos;
uint32_t idx = start + t;
if (idx < statistics_count) {
@@ -120,19 +118,15 @@ __global__ void __launch_bounds__(buffersize_threads_per_block, 1)
default: break;
}
}
stats_pos = WarpReducePos32(stats_len, tx);
if (tx == buffersize_reduction_dim - 1) { scratch_red[ty] = stats_pos; }
__syncthreads();
if (ty == 0) { scratch_red[tx] = WarpReducePos32(scratch_red[tx], tx); }
__syncthreads();
if (ty != 0) { stats_pos += scratch_red[ty - 1]; }
uint32_t tmp_stats_size;
block_scan(temp_storage).ExclusiveSum(stats_len, stats_pos, tmp_stats_size);
stats_pos += stats_size;
stats_size += tmp_stats_size;
if (idx < statistics_count) {
groups[idx].start_chunk = stats_pos - stats_len;
groups[idx].start_chunk = stats_pos;
groups[idx].num_chunks = stats_len;
}
__syncthreads();
if (t == buffersize_threads_per_block - 1) { stats_size = stats_pos; }
}
}

@@ -405,9 +399,8 @@ void orc_init_statistics_buffersize(statistics_merge_group *groups,
uint32_t statistics_count,
rmm::cuda_stream_view stream)
{
dim3 dim_block(buffersize_reduction_dim, buffersize_reduction_dim);
gpu_init_statistics_buffersize<<<1, dim_block, 0, stream.value()>>>(
groups, chunks, statistics_count);
gpu_init_statistics_buffersize<block_size>
<<<1, block_size, 0, stream.value()>>>(groups, chunks, statistics_count);
}

/**
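
The stats_enc.cu change also shows why `BlockScan` simplifies the offset-assignment loop: the scan's block aggregate is returned to every thread, so the running offset can be carried in a register instead of being staged through shared memory, and the old 32x32 two-stage warp scan collapses into one call. A standalone sketch (illustrative names, not cudf code):

```cuda
#include <cub/cub.cuh>
#include <cstdint>

constexpr unsigned int block_size = 1024;

// Sketch: assign each item the exclusive prefix sum of the lengths before
// it, tile by tile; `running` stays identical in every thread because
// ExclusiveSum broadcasts the block aggregate to all threads.
__global__ void __launch_bounds__(block_size, 1)
assign_offsets(uint32_t const *lengths, uint32_t *offsets, uint32_t count)
{
  using block_scan = cub::BlockScan<uint32_t, block_size, cub::BLOCK_SCAN_WARP_SCANS>;
  __shared__ typename block_scan::TempStorage temp_storage;

  uint32_t running = 0;  // same value in every thread
  for (uint32_t start = 0; start < count; start += block_size) {
    uint32_t idx = start + threadIdx.x;
    uint32_t len = (idx < count) ? lengths[idx] : 0;
    uint32_t pos, total;
    block_scan(temp_storage).ExclusiveSum(len, pos, total);
    if (idx < count) { offsets[idx] = running + pos; }
    running += total;
    __syncthreads();  // temp_storage is reused on the next iteration
  }
}
```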
