Skip to content

Commit

Permalink
Add patch for thrust-cub 1.16 to fix sort compile times (rapidsai#10577)
Browse files Browse the repository at this point in the history
Fixes `thrust.patch` to patch the CUB source for `sort` to minimize the inlining of the comparator functor. The build was updated in rapidsai#10489 to thrust-1.16 which includes change to thrust sort using CUB's `DeviceMergeSort`. This means the previous patch does not apply to the new thrust/cub source. This dramatically increased the build for `sort.cu` and other related source files as can be seen in this Build Metrics Report from rapidsai#10489: https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-cpu-cuda-build/CUDA=11.5/8633/Build_20Metrics_20Report/

This PR moves the `pragma unroll` changes into the appropriate CUB source files reducing the build time back to the previous levels (or close to it I hope).

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: rapidsai#10577
  • Loading branch information
davidwendt authored and abellina committed Apr 11, 2022
1 parent adec535 commit aa03473
Showing 1 changed file with 53 additions and 49 deletions.
102 changes: 53 additions & 49 deletions cpp/cmake/thrust.patch
Original file line number Diff line number Diff line change
@@ -1,52 +1,39 @@
diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h
index 1ffeef0..5e80800 100644
--- a/thrust/system/cuda/detail/sort.h
+++ b/thrust/system/cuda/detail/sort.h
@@ -108,7 +108,7 @@ namespace __merge_sort {
key_type key2 = keys_shared[keys2_beg];

diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh
index 4769df36..d86d6342 100644
--- a/cub/block/block_merge_sort.cuh
+++ b/cub/block/block_merge_sort.cuh
@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared,
KeyT key1 = keys_shared[keys1_beg];
KeyT key2 = keys_shared[keys2_beg];

-#pragma unroll
+#pragma unroll 1
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
bool p = (keys2_beg < keys2_end) &&
@@ -311,10 +311,10 @@ namespace __merge_sort {
void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD],
item_type (&items)[ITEMS_PER_THREAD])
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
{
bool p = (keys2_beg < keys2_end) &&
@@ -383,7 +383,7 @@ public:
//
KeyT max_key = oob_default;

- #pragma unroll
+ #pragma unroll 1
for (int item = 1; item < ITEMS_PER_THREAD; ++item)
{
-#pragma unroll
+#pragma unroll 1
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
{
-#pragma unroll
+#pragma unroll 1
for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
{
if (compare_op(keys[j + 1], keys[j]))
@@ -350,7 +350,7 @@ namespace __merge_sort {
// each thread has sorted keys_loc
// merge sort keys_loc in shared memory
//
-#pragma unroll
+#pragma unroll 1
for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2)
{
sync_threadblock();
@@ -479,7 +479,7 @@ namespace __merge_sort {
// and fill the remainig keys with it
//
key_type max_key = keys_loc[0];
-#pragma unroll
+#pragma unroll 1
for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
if (ITEMS_PER_THREAD * tid + ITEM < num_remaining)
diff a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
index 41eb1d2..f2893b4 100644
if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
@@ -407,7 +407,7 @@ public:
// each thread has sorted keys
// merge sort keys in shared memory
//
- #pragma unroll
+ #pragma unroll 1
for (int target_merged_threads_number = 2;
target_merged_threads_number <= NUM_THREADS;
target_merged_threads_number *= 2)
diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
index b188c75f..3f36656f 100644
--- a/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -723,7 +723,7 @@ struct DeviceRadixSortPolicy
@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy


/// SM60 (GP100)
Expand All @@ -55,11 +42,11 @@ index 41eb1d2..f2893b4 100644
{
enum {
PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100)
diff a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
index f6aee45..dd64301 100644
diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
index e0470ccb..6a0c2ed6 100644
--- a/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/device/dispatch/dispatch_reduce.cuh
@@ -284,7 +284,7 @@ struct DeviceReducePolicy
@@ -280,7 +280,7 @@ struct DeviceReducePolicy
};

/// SM60
Expand All @@ -68,11 +55,11 @@ index f6aee45..dd64301 100644
{
// ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
typedef AgentReducePolicy<
diff a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
index c0c6d59..937ee31 100644
diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
index c2d04588..ac2d10e0 100644
--- a/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/device/dispatch/dispatch_scan.cuh
@@ -178,7 +178,7 @@ struct DeviceScanPolicy
@@ -177,7 +177,7 @@ struct DeviceScanPolicy
};

/// SM600
Expand All @@ -81,3 +68,20 @@ index c0c6d59..937ee31 100644
{
typedef AgentScanPolicy<
128, 15, ///< Threads per block, items per thread
diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh
index 5d486789..b42fb5f0 100644
--- a/cub/thread/thread_sort.cuh
+++ b/cub/thread/thread_sort.cuh
@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD],
{
constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;

- #pragma unroll
+ #pragma unroll 1
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
{
- #pragma unroll
+ #pragma unroll 1
for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
{
if (compare_op(keys[j + 1], keys[j]))

0 comments on commit aa03473

Please sign in to comment.