From 2b080665ec409cd72a4e30fb7daa15b1bce30b0b Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Fri, 11 Aug 2023 20:38:48 +0000
Subject: [PATCH] Fix CubDebug

---
 cub/cub/agent/single_pass_scan_operators.cuh  |   8 +-
 .../dispatch/dispatch_adjacent_difference.cuh |  28 +-
 .../device/dispatch/dispatch_batch_memcpy.cuh |  63 ++--
 .../device/dispatch/dispatch_histogram.cuh    |  67 ++--
 .../device/dispatch/dispatch_merge_sort.cuh   |  48 +--
 .../device/dispatch/dispatch_radix_sort.cuh   | 296 +++++++++++++-----
 cub/cub/device/dispatch/dispatch_reduce.cuh   |  71 +++--
 .../dispatch/dispatch_reduce_by_key.cuh       |  47 +--
 cub/cub/device/dispatch/dispatch_rle.cuh      |  49 +--
 cub/cub/device/dispatch/dispatch_scan.cuh     |  52 +--
 .../device/dispatch/dispatch_scan_by_key.cuh  |  53 ++--
 .../dispatch/dispatch_segmented_sort.cuh      |  76 ++---
 .../device/dispatch/dispatch_select_if.cuh    |  37 ++-
 .../dispatch/dispatch_three_way_partition.cuh |  48 +--
 .../dispatch/dispatch_unique_by_key.cuh       |  81 +++--
 cub/cub/grid/grid_barrier.cuh                 |  21 +-
 cub/cub/util_allocator.cuh                    | 129 ++++++--
 cub/cub/util_debug.cuh                        |  11 +-
 cub/cub/util_device.cuh                       |  29 +-
 cub/docs/developer_overview.rst               |   6 +-
 cub/test/catch2_test_debug.cu                 |  37 +++
 21 files changed, 843 insertions(+), 414 deletions(-)
 create mode 100644 cub/test/catch2_test_debug.cu

diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh
index 7dceb902758..342e859246c 100644
--- a/cub/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/cub/agent/single_pass_scan_operators.cuh
@@ -701,7 +701,13 @@ struct ScanTileState
         allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives
 
         // Compute allocation pointers into the single storage blob
-        if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+        error = CubDebug(
+          AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
+
+        if (cudaSuccess != error)
+        {
+          break;
+        }
 
         // Alias the offsets
         d_tile_status = reinterpret_cast(allocations[0]);
diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index 39d58de2429..25f5c25e7f4 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -221,10 +221,10 @@ struct DispatchAdjacentDifference : public SelectedPolicy
       void *allocations[1]            = {nullptr};
       std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size};
 
-      if (CubDebug(error = AliasTemporaries(d_temp_storage,
-                                            temp_storage_bytes,
-                                            allocations,
-                                            allocation_sizes)))
+      error = CubDebug(
+        AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
+
+      if (cudaSuccess != error)
       {
         break;
       }
@@ -278,15 +278,16 @@ struct DispatchAdjacentDifference : public SelectedPolicy
             num_tiles,
             tile_size);
 
-        error = detail::DebugSyncStream(stream);
+        error = CubDebug(detail::DebugSyncStream(stream));
 
-        if (CubDebug(error))
+        if (cudaSuccess != error)
         {
           break;
         }
 
         // Check for failure to launch
-        if (CubDebug(error = cudaPeekAtLastError()))
+        error = CubDebug(cudaPeekAtLastError());
+        if (cudaSuccess != error)
         {
           break;
         }
@@ -319,15 +320,16 @@ struct DispatchAdjacentDifference : public SelectedPolicy
             difference_op,
             num_items);
 
-        error = detail::DebugSyncStream(stream);
+        error = CubDebug(detail::DebugSyncStream(stream));
 
-        if (CubDebug(error))
+        if (cudaSuccess != error)
         {
           break;
         }
 
         // Check for failure to launch
-        if (CubDebug(error = cudaPeekAtLastError()))
+        error = CubDebug(cudaPeekAtLastError());
+        if (cudaSuccess != error)
         {
           break;
         }
@@ -352,7 +354,8 @@ struct DispatchAdjacentDifference : public SelectedPolicy
    {
      // Get PTX version
      int ptx_version = 0;
-     if (CubDebug(error = PtxVersion(ptx_version)))
+     error = CubDebug(PtxVersion(ptx_version));
+     if (cudaSuccess != error)
      {
        break;
      }
@@ -367,7 +370,8 @@ struct DispatchAdjacentDifference : public SelectedPolicy
                                        stream);
 
      // Dispatch to chained policy
-     if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
+     error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch));
+     if (cudaSuccess != error)
      {
        break;
      }
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index a8b5a1f6550..4dfddce59e2 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -470,13 +470,14 @@ struct DispatchBatchMemcpy : SelectedPolicy
     std::size_t buffer_offset_scan_storage = 0;
     std::size_t blev_block_scan_storage    = 0;
 
-    CubDebug(error = BLevBufferOffsetTileState::AllocationSize(static_cast(num_tiles),
+    error = CubDebug(BLevBufferOffsetTileState::AllocationSize(static_cast(num_tiles),
                                                                buffer_offset_scan_storage));
     if (error)
     {
       return error;
     }
-    CubDebug(error = BLevBlockOffsetTileState::AllocationSize(static_cast(num_tiles),
+
+    error = CubDebug(BLevBlockOffsetTileState::AllocationSize(static_cast(num_tiles),
                                                               blev_block_scan_storage));
     if (error)
     {
@@ -504,8 +505,8 @@ struct DispatchBatchMemcpy : SelectedPolicy
     }
 
     // Alias memory buffers into the storage blob
-    if (CubDebug(
-          error = temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes)))
+    error = CubDebug(temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes));
+    if (cudaSuccess != error)
     {
       return error;
     }
@@ -551,25 +552,26 @@ struct DispatchBatchMemcpy : SelectedPolicy
 
     // Get device ordinal
     int device_ordinal;
-    if (CubDebug(error = cudaGetDevice(&device_ordinal)))
+    error = CubDebug(cudaGetDevice(&device_ordinal));
+    if (cudaSuccess != error)
     {
       return error;
     }
 
     // Get SM count
     int sm_count;
-    if (CubDebug(error = cudaDeviceGetAttribute(&sm_count,
-                                                cudaDevAttrMultiProcessorCount,
-                                                device_ordinal)))
+    error =
+      CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
+    if (cudaSuccess != error)
     {
       return error;
     }
 
     // Get SM occupancy for the batch memcpy block-level buffers kernel
    int batch_memcpy_blev_occupancy;
-    if (CubDebug(error = MaxSmOccupancy(batch_memcpy_blev_occupancy,
-                                        multi_block_memcpy_kernel,
-                                        BLEV_BLOCK_THREADS)))
+    error = CubDebug(
+      MaxSmOccupancy(batch_memcpy_blev_occupancy, multi_block_memcpy_kernel, BLEV_BLOCK_THREADS));
+    if (cudaSuccess != error)
     {
       return error;
     }
@@ -579,18 +581,20 @@ struct DispatchBatchMemcpy : SelectedPolicy
 
     // Construct the tile status for the buffer prefix sum
     BLevBufferOffsetTileState buffer_scan_tile_state;
-    if (CubDebug(error = buffer_scan_tile_state.Init(static_cast(num_tiles),
-                                                     blev_buffer_scan_alloc.get(),
-                                                     buffer_offset_scan_storage)))
+    error = CubDebug(buffer_scan_tile_state.Init(static_cast(num_tiles),
+                                                 blev_buffer_scan_alloc.get(),
+                                                 buffer_offset_scan_storage));
+    if (cudaSuccess != error)
     {
       return error;
     }
 
     // Construct the tile status for thread blocks-to-buffer-assignment prefix sum
     BLevBlockOffsetTileState block_scan_tile_state;
-    if (CubDebug(error = block_scan_tile_state.Init(static_cast(num_tiles),
-                                                    blev_block_scan_alloc.get(),
-
blev_block_scan_storage))) + error = CubDebug(block_scan_tile_state.Init(static_cast(num_tiles), + blev_block_scan_alloc.get(), + blev_block_scan_storage)); + if (cudaSuccess != error) { return error; } @@ -612,16 +616,17 @@ struct DispatchBatchMemcpy : SelectedPolicy .doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles); // Check for failure to launch - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); + error = CubDebug(detail::DebugSyncStream(stream)); // Check for failure to launch - if (CubDebug(error)) + if (cudaSuccess != error) { return error; } @@ -654,14 +659,15 @@ struct DispatchBatchMemcpy : SelectedPolicy block_scan_tile_state); // Check for failure to launch - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -687,13 +693,14 @@ struct DispatchBatchMemcpy : SelectedPolicy batch_memcpy_grid_size - 1); // Check for failure to launch - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); + error = CubDebug(detail::DebugSyncStream(stream)); return error; } @@ -718,7 +725,8 @@ struct DispatchBatchMemcpy : SelectedPolicy // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { return error; } @@ -733,7 +741,8 @@ struct DispatchBatchMemcpy : SelectedPolicy stream); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { return error; } diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 4621be04003..b0b8d6fa879 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -315,25 +315,27 @@ struct dispatch_histogram { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } // Get SM count int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, - cudaDevAttrMultiProcessorCount, - device_ordinal))) + error = + CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); + + if (cudaSuccess != error) { break; } // Get SM occupancy for histogram_sweep_kernel int histogram_sweep_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(histogram_sweep_sm_occupancy, - histogram_sweep_kernel, - block_threads))) + error = CubDebug( + MaxSmOccupancy(histogram_sweep_sm_occupancy, histogram_sweep_kernel, block_threads)); + if (cudaSuccess != error) { break; } @@ -378,9 +380,9 @@ struct dispatch_histogram // Alias the temporary allocations from the single storage blob (or compute the // necessary size of the blob) - if (CubDebug( - error = - AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, 
allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -504,14 +506,15 @@ struct dispatch_histogram tile_queue); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -958,7 +961,8 @@ public: { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1012,7 +1016,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1045,7 +1050,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1152,7 +1158,8 @@ public: { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1205,7 +1212,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1313,7 +1321,8 @@ public: { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1330,10 +1339,10 @@ public: for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { - error = privatized_decode_op[channel].Init(num_output_levels[channel], - upper_level[channel], - lower_level[channel]); - if (CubDebug(error != cudaSuccess)) + error = CubDebug(privatized_decode_op[channel].Init(num_output_levels[channel], + upper_level[channel], + lower_level[channel])); + if (error != cudaSuccess) { // Make sure to also return a reasonable value for `temp_storage_bytes` in case of // an overflow of the bin computation, in which case a subsequent algorithm @@ -1380,7 +1389,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1413,7 +1423,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1525,7 +1536,8 @@ public: { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1581,7 +1593,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 5bbd3dfdd78..7b6c73a1ddf 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -570,7 +570,8 @@ 
struct DispatchMergeSort : SelectedPolicy { // Get device ordinal int device_ordinal = 0; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -630,10 +631,11 @@ struct DispatchMergeSort : SelectedPolicy if (runtime_shmem_size_check_is_required) { int max_shmem = 0; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_shmem, - cudaDevAttrMaxSharedMemoryPerBlock, - device_ordinal))) + + error = CubDebug( + cudaDeviceGetAttribute(&max_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_ordinal)); + + if (cudaSuccess != error) { break; } @@ -656,10 +658,9 @@ struct DispatchMergeSort : SelectedPolicy temporary_values_storage_size, virtual_shared_memory_size}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -726,14 +727,15 @@ struct DispatchMergeSort : SelectedPolicy block_sort_launcher.launch(); - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } @@ -789,14 +791,15 @@ struct DispatchMergeSort : SelectedPolicy target_merged_tiles_number, tile_size); - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } @@ -804,14 +807,15 @@ struct DispatchMergeSort : SelectedPolicy // Merge merge_launcher.launch(ping, target_merged_tiles_number); - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } @@ -840,7 +844,10 @@ struct DispatchMergeSort : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + + error = CubDebug(PtxVersion(ptx_version)); + + if (cudaSuccess != error) { break; } @@ -858,7 +865,8 @@ struct DispatchMergeSort : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 84b6ccffd19..9a75e6fe0bb 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1303,14 +1303,15 @@ struct DispatchRadixSort : SelectedPolicy decomposer); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1373,14 +1374,15 @@ 
struct DispatchRadixSort : SelectedPolicy decomposer); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1399,14 +1401,15 @@ struct DispatchRadixSort : SelectedPolicy pass_spine_length); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1435,14 +1438,15 @@ struct DispatchRadixSort : SelectedPolicy decomposer); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1498,9 +1502,23 @@ struct DispatchRadixSort : SelectedPolicy radix_bits = DownsweepPolicyT::RADIX_BITS; radix_digits = 1 << radix_bits; - if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; - if (CubDebug(error = scan_config.Init(scan_kernel))) break; - if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; + error = CubDebug(upsweep_config.Init(upsweep_kernel)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(scan_config.Init(scan_kernel)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(downsweep_config.Init(downsweep_kernel)); + if (cudaSuccess != error) + { + break; + } max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0); @@ -1572,24 +1590,49 @@ struct DispatchRadixSort : SelectedPolicy do { // initialization - if (CubDebug(error = cudaMemsetAsync( - d_ctrs, 0, num_portions * num_passes * sizeof(AtomicOffsetT), stream))) break; + error = CubDebug( + cudaMemsetAsync(d_ctrs, 0, num_portions * num_passes * sizeof(AtomicOffsetT), stream)); + if (cudaSuccess != error) + { + break; + } // compute num_passes histograms with RADIX_DIGITS bins each - if (CubDebug(error = cudaMemsetAsync - (d_bins, 0, num_passes * RADIX_DIGITS * sizeof(OffsetT), stream))) break; + error = CubDebug( + cudaMemsetAsync(d_bins, 0, num_passes * RADIX_DIGITS * sizeof(OffsetT), stream)); + if (cudaSuccess != error) + { + break; + } int device = -1; int num_sms = 0; - if (CubDebug(error = cudaGetDevice(&device))) break; - if (CubDebug(error = cudaDeviceGetAttribute( - &num_sms, cudaDevAttrMultiProcessorCount, device))) break; + + error = CubDebug(cudaGetDevice(&device)); + if (cudaSuccess != error) + { + break; + } + + error = + CubDebug(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device)); + if (cudaSuccess != error) + { + break; + } const int HISTO_BLOCK_THREADS = ActivePolicyT::HistogramPolicy::BLOCK_THREADS; int histo_blocks_per_sm = 1; auto histogram_kernel = DeviceRadixSortHistogramKernel< MaxPolicyT, IS_DESCENDING, KeyT, OffsetT, DecomposerT>; - if (CubDebug(error = cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &histo_blocks_per_sm, histogram_kernel, 
HISTO_BLOCK_THREADS, 0))) break; + + error = CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&histo_blocks_per_sm, + histogram_kernel, + HISTO_BLOCK_THREADS, + 0)); + if (cudaSuccess != error) + { + break; + } // log histogram_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG @@ -1605,13 +1648,14 @@ struct DispatchRadixSort : SelectedPolicy histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream ).doit(histogram_kernel, d_bins, d_keys.Current(), num_items, begin_bit, end_bit, decomposer); - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { break; } - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1630,13 +1674,14 @@ struct DispatchRadixSort : SelectedPolicy num_passes, SCAN_BLOCK_THREADS, 0, stream ).doit(DeviceRadixSortExclusiveSumKernel, d_bins); - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { break; } - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1657,14 +1702,21 @@ struct DispatchRadixSort : SelectedPolicy for (OffsetT portion = 0; portion < num_portions; ++portion) { PortionOffsetT portion_num_items = - static_cast( - CUB_MIN(num_items - portion * PORTION_SIZE, - static_cast(PORTION_SIZE))); - PortionOffsetT num_blocks = - cub::DivideAndRoundUp(portion_num_items, ONESWEEP_TILE_ITEMS); - if (CubDebug(error = cudaMemsetAsync( - d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), - stream))) break; + static_cast(CUB_MIN(num_items - portion * PORTION_SIZE, + static_cast(PORTION_SIZE))); + + PortionOffsetT num_blocks = cub::DivideAndRoundUp(portion_num_items, + ONESWEEP_TILE_ITEMS); + + error = + CubDebug(cudaMemsetAsync(d_lookback, + 0, + num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), + stream)); + if (cudaSuccess != error) + { + break; + } // log onesweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG @@ -1691,13 +1743,14 @@ struct DispatchRadixSort : SelectedPolicy d_values.Current() + portion * PORTION_SIZE, portion_num_items, current_bit, num_bits, decomposer); - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { break; } - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1745,11 +1798,20 @@ struct DispatchRadixSort : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) + { + break; + } // Get SM count int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + error = CubDebug( + cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); + if (cudaSuccess != error) + { + break; + } // Init regular and alternate-digit kernel configurations PassConfig pass_config, alt_pass_config; @@ -1795,11 +1857,18 @@ struct DispatchRadixSort : SelectedPolicy }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if 
(cudaSuccess != error) + { + break; + } // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) - return cudaSuccess; + { + return cudaSuccess; + } // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size int num_bits = end_bit - begin_bit; @@ -1821,20 +1890,37 @@ struct DispatchRadixSort : SelectedPolicy // Run first pass, consuming from the input's current buffers int current_bit = begin_bit; - if (CubDebug(error = InvokePass( - d_keys.Current(), d_keys_remaining_passes.Current(), - d_values.Current(), d_values_remaining_passes.Current(), - d_spine, spine_length, current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + error = + CubDebug(InvokePass(d_keys.Current(), + d_keys_remaining_passes.Current(), + d_values.Current(), + d_values_remaining_passes.Current(), + d_spine, + spine_length, + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); + if (cudaSuccess != error) + { + break; + } // Run remaining passes while (current_bit < end_bit) { - if (CubDebug(error = InvokePass( - d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_spine, spine_length, current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;; + error = CubDebug(InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_spine, + spine_length, + current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config)); + + if (cudaSuccess != error) + { + break; + } // Invert selectors d_keys_remaining_passes.selector ^= 1; @@ -1898,13 +1984,19 @@ struct DispatchRadixSort : SelectedPolicy (long long)stream); #endif cudaError_t error = cudaSuccess; - error = cudaMemcpyAsync(d_keys.Alternate(), d_keys.Current(), num_items * sizeof(KeyT), - cudaMemcpyDefault, stream); - if (CubDebug(error)) + + error = CubDebug(cudaMemcpyAsync(d_keys.Alternate(), + d_keys.Current(), + num_items * sizeof(KeyT), + cudaMemcpyDefault, + stream)); + if (cudaSuccess != error) { return error; } - if (CubDebug(error = detail::DebugSyncStream(stream))) + + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -1917,13 +2009,18 @@ struct DispatchRadixSort : SelectedPolicy _CubLog("Invoking async copy of %lld values on stream %lld\n", (long long)num_items, (long long)stream); #endif - error = cudaMemcpyAsync(d_values.Alternate(), d_values.Current(), - num_items * sizeof(ValueT), cudaMemcpyDefault, stream); - if (CubDebug(error)) + error = CubDebug(cudaMemcpyAsync(d_values.Alternate(), + d_values.Current(), + num_items * sizeof(ValueT), + cudaMemcpyDefault, + stream)); + if (cudaSuccess != error) { return error; } - if (CubDebug(error = detail::DebugSyncStream(stream))) + + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -2004,7 +2101,12 @@ struct DispatchRadixSort : SelectedPolicy do { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; + + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + break; + } // Create dispatch functor DispatchRadixSort dispatch(d_temp_storage, @@ -2020,8 +2122,11 @@ struct DispatchRadixSort : SelectedPolicy decomposer); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) + { + break; + } } while (0); return error; @@ -2220,14 +2325,15 @@ struct DispatchSegmentedRadixSort : SelectedPolicy current_bit, pass_bits, decomposer); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -2290,7 +2396,11 @@ struct DispatchSegmentedRadixSort : SelectedPolicy }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + break; + } // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) @@ -2320,20 +2430,32 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Run first pass, consuming from the input's current buffers int current_bit = begin_bit; - if (CubDebug(error = InvokePass( - d_keys.Current(), d_keys_remaining_passes.Current(), - d_values.Current(), d_values_remaining_passes.Current(), - current_bit, - (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; + error = + CubDebug(InvokePass(d_keys.Current(), + d_keys_remaining_passes.Current(), + d_values.Current(), + d_values_remaining_passes.Current(), + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); + if (cudaSuccess != error) + { + break; + } // Run remaining passes while (current_bit < end_bit) { - if (CubDebug(error = InvokePass( - d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + error = CubDebug(InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); + if (cudaSuccess != error) + { + break; + } // Invert selectors and update current bit d_keys_remaining_passes.selector ^= 1; @@ -2409,7 +2531,12 @@ struct DispatchSegmentedRadixSort : SelectedPolicy do { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; + + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + break; + } // Create dispatch functor DispatchSegmentedRadixSort dispatch( @@ -2420,8 +2547,11 @@ struct DispatchSegmentedRadixSort : SelectedPolicy stream, ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) + { + break; + } } while (0); return error; diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index e3402b6d8c9..698ce0552e8 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -676,14 +676,15 @@ struct DispatchReduce : SelectedPolicy .doit(single_tile_kernel, d_in, d_out, num_items, reduction_op, init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -726,24 +727,25 @@ struct DispatchReduce : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) + { break; + } // Get SM count int sm_count; - if (CubDebug( - error = cudaDeviceGetAttribute(&sm_count, - cudaDevAttrMultiProcessorCount, - device_ordinal))) + error = + CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); + if (cudaSuccess != error) { break; } // Init regular kernel configuration KernelConfig reduce_config; - if (CubDebug( - error = reduce_config.Init( - reduce_kernel))) + error = CubDebug(reduce_config.Init(reduce_kernel)); + if (cudaSuccess != error) { break; } @@ -764,10 +766,9 @@ struct DispatchReduce : 
SelectedPolicy // Alias the temporary allocations from the single storage blob (or // compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -810,14 +811,15 @@ struct DispatchReduce : SelectedPolicy reduction_op); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -845,14 +847,15 @@ struct DispatchReduce : SelectedPolicy init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -956,7 +959,8 @@ struct DispatchReduce : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -973,7 +977,8 @@ struct DispatchReduce : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1199,10 +1204,9 @@ struct DispatchSegmentedReduce : SelectedPolicy // Init kernel configuration KernelConfig segmented_reduce_config; - if (CubDebug( - error = segmented_reduce_config - .Init( - segmented_reduce_kernel))) + error = CubDebug(segmented_reduce_config.Init( + segmented_reduce_kernel)); + if (cudaSuccess != error) { break; } @@ -1234,14 +1238,15 @@ struct DispatchSegmentedReduce : SelectedPolicy init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1341,7 +1346,8 @@ struct DispatchSegmentedReduce : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1360,7 +1366,8 @@ struct DispatchSegmentedReduce : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index d56f4cb4d02..a29c1376a41 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -295,7 +295,8 @@ struct DispatchReduceByKey { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = 
CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -306,7 +307,8 @@ struct DispatchReduceByKey // Specify temporary storage allocation requirements size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } @@ -314,9 +316,10 @@ struct DispatchReduceByKey // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[1] = {}; - if (CubDebug( - error = - AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -330,7 +333,8 @@ struct DispatchReduceByKey // Construct the tile status interface ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) + error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -353,14 +357,15 @@ struct DispatchReduceByKey .doit(init_kernel, tile_state, num_tiles, d_num_runs_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -373,17 +378,18 @@ struct DispatchReduceByKey // Get SM occupancy for reduce_by_key_kernel int reduce_by_key_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(reduce_by_key_sm_occupancy, - reduce_by_key_kernel, - block_threads))) + error = + CubDebug(MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, block_threads)); + + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -422,14 +428,15 @@ struct DispatchReduceByKey num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -517,7 +524,8 @@ struct DispatchReduceByKey { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -535,7 +543,8 @@ struct DispatchReduceByKey stream); // Dispatch - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 6b978fa95fc..06c6fc90f7b 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -298,8 +298,11 @@ struct DeviceRleDispatch { // 
Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) + { break; + } // Number of input tiles int tile_size = block_threads * items_per_thread; @@ -307,7 +310,8 @@ struct DeviceRleDispatch // Specify temporary storage allocation requirements size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } @@ -315,9 +319,10 @@ struct DeviceRleDispatch // Compute allocation pointers into the single storage blob (or compute the necessary size of // the blob) void *allocations[1] = {}; - if (CubDebug( - error = - AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (error != cudaSuccess) { break; } @@ -330,7 +335,8 @@ struct DeviceRleDispatch // Construct the tile status interface ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) + error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -353,14 +359,15 @@ struct DeviceRleDispatch .doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -373,17 +380,18 @@ struct DeviceRleDispatch // Get SM occupancy for device_rle_sweep_kernel int device_rle_kernel_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(device_rle_kernel_sm_occupancy, // out - device_rle_sweep_kernel, - block_threads))) + error = CubDebug(MaxSmOccupancy(device_rle_kernel_sm_occupancy, // out + device_rle_sweep_kernel, + block_threads)); + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -423,14 +431,15 @@ struct DeviceRleDispatch num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -506,7 +515,8 @@ struct DeviceRleDispatch { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -522,7 +532,8 @@ struct DeviceRleDispatch stream); // Dispatch - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh 
b/cub/cub/device/dispatch/dispatch_scan.cuh index a4bf2eff5fa..f16f1c0fd96 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -365,7 +365,8 @@ struct DispatchScan : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -377,8 +378,8 @@ struct DispatchScan : SelectedPolicy // Specify temporary storage allocation requirements size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, - allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } @@ -386,10 +387,10 @@ struct DispatchScan : SelectedPolicy // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[1] = {}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -409,9 +410,8 @@ struct DispatchScan : SelectedPolicy // Construct the tile status interface ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, - allocations[0], - allocation_sizes[0]))) + error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -436,32 +436,33 @@ struct DispatchScan : SelectedPolicy .doit(init_kernel, tile_state, num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Get SM occupancy for scan_kernel int scan_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out - scan_kernel, - Policy::BLOCK_THREADS))) + error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out + scan_kernel, + Policy::BLOCK_THREADS)); + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, - cudaDevAttrMaxGridDimX, - device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -499,14 +500,15 @@ struct DispatchScan : SelectedPolicy num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -581,7 +583,8 @@ struct DispatchScan : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -598,7 +601,8 @@ struct DispatchScan : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = 
CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 2139721f734..b70e49be272 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -379,7 +379,8 @@ struct DispatchScanByKey : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -391,9 +392,8 @@ struct DispatchScanByKey : SelectedPolicy // Specify temporary storage allocation requirements size_t allocation_sizes[2]; - if (CubDebug( - error = ScanByKeyTileStateT::AllocationSize(num_tiles, - allocation_sizes[0]))) + error = CubDebug(ScanByKeyTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } @@ -403,10 +403,10 @@ struct DispatchScanByKey : SelectedPolicy // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[2] = {}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -428,9 +428,8 @@ struct DispatchScanByKey : SelectedPolicy // Construct the tile status interface ScanByKeyTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, - allocations[0], - allocation_sizes[0]))) + error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -459,33 +458,34 @@ struct DispatchScanByKey : SelectedPolicy num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Get SM occupancy for scan_kernel int scan_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out - scan_kernel, - Policy::BLOCK_THREADS))) + error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out + scan_kernel, + Policy::BLOCK_THREADS)); + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, - cudaDevAttrMaxGridDimX, - device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -526,14 +526,15 @@ struct DispatchScanByKey : SelectedPolicy num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -619,7 +620,8 @@ struct DispatchScanByKey : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { 
break; } @@ -638,7 +640,8 @@ struct DispatchScanByKey : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index d00e998f165..2eec9290bb9 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -599,14 +599,15 @@ DeviceSegmentedSortContinuation( d_end_offsets); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -657,14 +658,15 @@ DeviceSegmentedSortContinuation( d_end_offsets); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -734,7 +736,7 @@ DeviceSegmentedSortContinuationKernel( small_segments_indices, 0); // always launching on the main stream (see motivation above) - CubDebug(error); + error = CubDebug(error); } #endif // CUB_RDC_ENABLED @@ -1357,9 +1359,8 @@ struct DispatchSegmentedSort : SelectedPolicy break; } - if (CubDebug( - error = temporary_storage_layout.map_to_buffer(d_temp_storage, - temp_storage_bytes))) + error = CubDebug(temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes)); + if (cudaSuccess != error) { break; } @@ -1500,7 +1501,8 @@ struct DispatchSegmentedSort : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1518,7 +1520,8 @@ struct DispatchSegmentedSort : SelectedPolicy stream); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1610,19 +1613,18 @@ private: THRUST_NS_QUALIFIER::make_reverse_iterator( large_and_medium_segments_indices.get() + num_segments); - error = cub::DevicePartition::If( - device_partition_temp_storage.get(), - three_way_partition_temp_storage_bytes, - THRUST_NS_QUALIFIER::counting_iterator(0), - large_and_medium_segments_indices.get(), - small_segments_indices.get(), - medium_indices_iterator, - group_sizes.get(), - num_segments, - large_segments_selector, - small_segments_selector, - stream); - if (CubDebug(error)) + error = CubDebug(cub::DevicePartition::If(device_partition_temp_storage.get(), + three_way_partition_temp_storage_bytes, + THRUST_NS_QUALIFIER::counting_iterator(0), + large_and_medium_segments_indices.get(), + small_segments_indices.get(), + medium_indices_iterator, + group_sizes.get(), + num_segments, + large_segments_selector, + small_segments_selector, + stream)); + if (cudaSuccess != error) { return error; } @@ -1661,14 +1663,15 @@ private: group_sizes.get(), \ 
large_and_medium_segments_indices.get(), \ small_segments_indices.get()); \ + error = CubDebug(error); \ \ - if (CubDebug(error)) \ + if (cudaSuccess != error) \ { \ return error; \ } \ \ - error = detail::DebugSyncStream(stream); \ - if (CubDebug(error)) \ + error = CubDebug(detail::DebugSyncStream(stream)); \ + if (cudaSuccess != error) \ { \ return error; \ } @@ -1681,18 +1684,20 @@ private: NV_IS_HOST, ( unsigned int h_group_sizes[num_selected_groups]; - - if (CubDebug(error = cudaMemcpyAsync(h_group_sizes, + error = CubDebug(cudaMemcpyAsync(h_group_sizes, group_sizes.get(), num_selected_groups * sizeof(unsigned int), cudaMemcpyDeviceToHost, - stream))) + stream)); + + if (cudaSuccess != error) { return error; } - if (CubDebug(error = SyncStream(stream))) + error = CubDebug(SyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -1763,14 +1768,15 @@ private: d_end_offsets); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index a2f3536ed2e..56fa86e2ad9 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -338,7 +338,8 @@ struct DispatchSelectIf : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -350,14 +351,17 @@ struct DispatchSelectIf : SelectedPolicy size_t allocation_sizes[1]; // bytes needed for tile status descriptors - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; } // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) void* allocations[1] = {}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + + error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -370,7 +374,8 @@ struct DispatchSelectIf : SelectedPolicy // Construct the tile status interface ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) + error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -391,14 +396,15 @@ struct DispatchSelectIf : SelectedPolicy d_num_selected_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -411,7 +417,8 @@ struct DispatchSelectIf : SelectedPolicy // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + error = 
CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -427,9 +434,10 @@ struct DispatchSelectIf : SelectedPolicy { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(range_select_sm_occupancy, // out - select_if_kernel, - block_threads))) + error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out + select_if_kernel, + block_threads)); + if (cudaSuccess != error) { break; } @@ -461,14 +469,15 @@ struct DispatchSelectIf : SelectedPolicy num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index ed22bc86b12..2277956e24c 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -221,7 +221,8 @@ struct DispatchThreeWayPartitionIf { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -233,7 +234,8 @@ struct DispatchThreeWayPartitionIf // Specify temporary storage allocation requirements size_t allocation_sizes[1]; // bytes needed for tile status descriptors - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -241,10 +243,10 @@ struct DispatchThreeWayPartitionIf // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[1] = {}; - if (CubDebug(error = cub::AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + + error = CubDebug( + cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -265,7 +267,8 @@ struct DispatchThreeWayPartitionIf // Construct the tile status interface ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) + error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -291,22 +294,23 @@ struct DispatchThreeWayPartitionIf d_num_selected_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -322,9 +326,10 @@ struct DispatchThreeWayPartitionIf { // Get SM occupancy for select_if_kernel int 
range_select_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(range_select_sm_occupancy, // out - three_way_partition_kernel, - block_threads))) + error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out + three_way_partition_kernel, + block_threads)); + if (cudaSuccess != error) { break; } @@ -360,14 +365,15 @@ struct DispatchThreeWayPartitionIf num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -419,7 +425,8 @@ struct DispatchThreeWayPartitionIf { // Get PTX version int ptx_version = 0; - if (CubDebug(error = cub::PtxVersion(ptx_version))) + error = CubDebug(cub::PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -437,7 +444,8 @@ struct DispatchThreeWayPartitionIf stream); // Dispatch - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 07aeb59526f..e70d28f2291 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -216,7 +216,11 @@ struct DispatchUniqueByKey : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) + { + break; + } // Number of input tiles int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; @@ -224,10 +228,11 @@ struct DispatchUniqueByKey : SelectedPolicy // Size of virtual shared memory int max_shmem = 0; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_shmem, - cudaDevAttrMaxSharedMemoryPerBlock, - device_ordinal))) + + error = CubDebug(cudaDeviceGetAttribute(&max_shmem, + cudaDevAttrMaxSharedMemoryPerBlock, + device_ordinal)); + if (cudaSuccess != error) { break; } @@ -235,11 +240,24 @@ struct DispatchUniqueByKey : SelectedPolicy // Specify temporary storage allocation requirements size_t allocation_sizes[2] = {0, vshmem_size}; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Bytes needed for tile status descriptors + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) + { + break; + } // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) void *allocations[2] = {NULL, NULL}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + break; + } + if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation @@ -248,7 +266,11 @@ struct DispatchUniqueByKey : SelectedPolicy // Construct the tile status interface ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + error = CubDebug(tile_state.Init(num_tiles, allocations[0], 
allocation_sizes[0])); + if (cudaSuccess != error) + { + break; + } // Log init_kernel configuration num_tiles = CUB_MAX(1, num_tiles); @@ -264,11 +286,15 @@ struct DispatchUniqueByKey : SelectedPolicy ).doit(init_kernel, tile_state, num_tiles, d_num_selected_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) + { + break; + } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -278,7 +304,12 @@ struct DispatchUniqueByKey : SelectedPolicy // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; + error = + CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) + { + break; + } // Get grid size for scanning tiles dim3 scan_grid_size; @@ -291,9 +322,10 @@ struct DispatchUniqueByKey : SelectedPolicy { // Get SM occupancy for unique_by_key_kernel int scan_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out - scan_kernel, - Policy::BLOCK_THREADS))) + error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out + scan_kernel, + Policy::BLOCK_THREADS)); + if (cudaSuccess != error) { break; } @@ -325,14 +357,15 @@ struct DispatchUniqueByKey : SelectedPolicy num_tiles); // Check for failure to launch - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -388,7 +421,11 @@ struct DispatchUniqueByKey : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + break; + } // Create dispatch functor DispatchUniqueByKey dispatch( @@ -404,7 +441,11 @@ struct DispatchUniqueByKey : SelectedPolicy stream); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) + { + break; + } } while (0); diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index ca946f3cc84..063a2c395cd 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -154,7 +154,7 @@ public: cudaError_t retval = cudaSuccess; if (d_sync) { - CubDebug(retval = cudaFree(d_sync)); + retval = CubDebug(cudaFree(d_sync)); d_sync = NULL; } sync_bytes = 0; @@ -184,14 +184,27 @@ public: { if (d_sync) { - if (CubDebug(retval = cudaFree(d_sync))) break; + retval = CubDebug(cudaFree(d_sync)); + if (cudaSuccess != retval) + { + break; + } } sync_bytes = new_sync_bytes; // Allocate and initialize to zero - if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; - if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; + retval = CubDebug(cudaMalloc((void**) &d_sync, sync_bytes)); + if (cudaSuccess != retval) + { + break; + } + + retval = CubDebug(cudaMemset(d_sync, 0, new_sync_bytes)); + if (cudaSuccess != retval) + { + break; + } } } while (0); diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index 
bb6d6a41784..20c0a2ad80a 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -361,7 +361,12 @@ struct CachingDeviceAllocator if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } + device = entrypoint_device; } @@ -447,12 +452,22 @@ struct CachingDeviceAllocator // Set runtime's current device to specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - if (CubDebug(error = cudaSetDevice(device))) return error; + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } + + error = CubDebug(cudaSetDevice(device)); + if (cudaSuccess != error) + { + return error; + } } // Attempt to allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) + error = CubDebug(cudaMalloc(&search_key.d_ptr, search_key.bytes)); + if (error == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", @@ -475,8 +490,17 @@ struct CachingDeviceAllocator // on the current device // Free device memory and destroy stream event. - if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + error = CubDebug(cudaFree(block_itr->d_ptr)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(cudaEventDestroy(block_itr->ready_event)); + if (cudaSuccess != error) + { + break; + } // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; @@ -494,12 +518,21 @@ struct CachingDeviceAllocator if (error) return error; // Try to allocate again - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; + error = CubDebug(cudaMalloc(&search_key.d_ptr, search_key.bytes)); + if (cudaSuccess != error) + { + return error; + } } // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + error = + CubDebug(cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); + + if (cudaSuccess != error) + { return error; + } // Insert into live blocks mutex.lock(); @@ -513,7 +546,11 @@ struct CachingDeviceAllocator // Attempt to revert back to previous device if necessary if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + error = CubDebug(cudaSetDevice(entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } } } @@ -559,8 +596,11 @@ struct CachingDeviceAllocator if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { return error; + } device = entrypoint_device; } @@ -598,21 +638,43 @@ struct CachingDeviceAllocator // First set to specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - if (CubDebug(error = cudaSetDevice(device))) return error; + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { + return error; 
+ } + + error = CubDebug(cudaSetDevice(device)); + if (cudaSuccess != error) + { + return error; + } } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + error = CubDebug(cudaEventRecord(search_key.ready_event, search_key.associated_stream)); + if (cudaSuccess != error) + { + return error; + } } if (!recached) { // Free the allocation from the runtime and cleanup the event. - if (CubDebug(error = cudaFree(d_ptr))) return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + error = CubDebug(cudaFree(d_ptr)); + if (cudaSuccess != error) + { + return error; + } + + error = CubDebug(cudaEventDestroy(search_key.ready_event)); + if (cudaSuccess != error) + { + return error; + } if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); @@ -621,7 +683,11 @@ struct CachingDeviceAllocator // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + error = CubDebug(cudaSetDevice(entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } } return error; @@ -661,19 +727,36 @@ struct CachingDeviceAllocator // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { + break; + } } // Set current device ordinal if necessary if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) break; + error = CubDebug(cudaSetDevice(begin->device)); + if (cudaSuccess != error) + { + break; + } current_device = begin->device; } // Free device memory - if (CubDebug(error = cudaFree(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + error = CubDebug(cudaFree(begin->d_ptr)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(cudaEventDestroy(begin->ready_event)); + if (cudaSuccess != error) + { + break; + } // Reduce balance and erase entry const size_t block_bytes = begin->bytes; @@ -690,7 +773,11 @@ struct CachingDeviceAllocator // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + error = CubDebug(cudaSetDevice(entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } } return error; diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index 80b2bd336ed..5023524dd6e 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -190,18 +190,25 @@ cudaError_t Debug(cudaError_t error, const char *filename, int line) #ifndef CUB_RDC_ENABLED #define CUB_TEMP_DEVICE_CODE #else - #define CUB_TEMP_DEVICE_CODE cudaGetLastError() + #define CUB_TEMP_DEVICE_CODE last_error = cudaGetLastError() #endif + cudaError_t last_error = cudaSuccess; + NV_IF_TARGET( NV_IS_HOST, - (cudaGetLastError();), + (last_error = cudaGetLastError();), 
(CUB_TEMP_DEVICE_CODE;) ); #undef CUB_TEMP_DEVICE_CODE // clang-format on + if (error == cudaSuccess && last_error != cudaSuccess) + { + error = last_error; + } + #ifdef CUB_STDERR if (error) { diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index cc371826a91..d8caaedbb43 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -305,9 +305,8 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) ( cudaFuncAttributes empty_kernel_attrs; - result = cudaFuncGetAttributes(&empty_kernel_attrs, - reinterpret_cast(empty_kernel)); - CubDebug(result); + result = CubDebug(cudaFuncGetAttributes(&empty_kernel_attrs, + reinterpret_cast(empty_kernel))); ptx_version = empty_kernel_attrs.ptxVersion * 10; ), @@ -412,8 +411,17 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d do { int major = 0, minor = 0; - if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device))) break; - if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device))) break; + error = CubDebug(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); + if (cudaSuccess != error) + { + break; + } sm_version = major * 100 + minor * 10; } while (0); @@ -535,10 +543,15 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) has_uva = false; cudaError_t error = cudaSuccess; int device = -1; - if (CubDebug(error = cudaGetDevice(&device)) != cudaSuccess) return error; + error = CubDebug(cudaGetDevice(&device)); + if (cudaSuccess != error) + { + return error; + } + int uva = 0; - if (CubDebug(error = cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device)) - != cudaSuccess) + error = CubDebug(cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device)); + if (cudaSuccess != error) { return error; } diff --git a/cub/docs/developer_overview.rst b/cub/docs/developer_overview.rst index 24adfa42109..c60886d0d0d 100644 --- a/cub/docs/developer_overview.rst +++ b/cub/docs/developer_overview.rst @@ -523,7 +523,11 @@ The dispatch entry point is typically represented by a static member function th do { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + break; + } // Create dispatch functor DispatchSegmentedReduce dispatch( diff --git a/cub/test/catch2_test_debug.cu b/cub/test/catch2_test_debug.cu new file mode 100644 index 00000000000..12771934db4 --- /dev/null +++ b/cub/test/catch2_test_debug.cu @@ -0,0 +1,37 @@ +#include +#include + +#include "catch2_test_helper.h" + +TEST_CASE("CubDebug returns input error", "[debug][utils]") +{ + REQUIRE( CubDebug(cudaSuccess) == cudaSuccess ); + REQUIRE( CubDebug(cudaErrorInvalidConfiguration) == cudaErrorInvalidConfiguration ); +} + +TEST_CASE("CubDebug returns new errors", "[debug][utils]") +{ + cub::EmptyKernel<<<0, 0>>>(); + cudaError error = cudaPeekAtLastError(); + + REQUIRE( error != cudaSuccess ); + REQUIRE( CubDebug(cudaSuccess) != cudaSuccess ); +} + +TEST_CASE("CubDebug prefers input errors", "[debug][utils]") +{ + cub::EmptyKernel<<<0, 0>>>(); + cudaError error = cudaPeekAtLastError(); + + REQUIRE( error != cudaSuccess ); + REQUIRE( CubDebug(cudaErrorMemoryAllocation) != cudaSuccess ); +} + +TEST_CASE("CubDebug resets last error", 
"[debug][utils]") +{ + cub::EmptyKernel<<<0, 0>>>(); + cudaError error = cudaPeekAtLastError(); + + REQUIRE( CubDebug(cudaSuccess) != cudaSuccess ); + REQUIRE( CubDebug(cudaSuccess) == cudaSuccess ); +}