From 2b080665ec409cd72a4e30fb7daa15b1bce30b0b Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko
Date: Fri, 11 Aug 2023 20:38:48 +0000
Subject: [PATCH] Fix CubDebug

---
 cub/cub/agent/single_pass_scan_operators.cuh  |   8 +-
 .../dispatch/dispatch_adjacent_difference.cuh |  28 +-
 .../device/dispatch/dispatch_batch_memcpy.cuh |  63 ++--
 .../device/dispatch/dispatch_histogram.cuh    |  67 ++--
 .../device/dispatch/dispatch_merge_sort.cuh   |  48 +--
 .../device/dispatch/dispatch_radix_sort.cuh   | 296 +++++++++++++-----
 cub/cub/device/dispatch/dispatch_reduce.cuh   |  71 +++--
 .../dispatch/dispatch_reduce_by_key.cuh       |  47 +--
 cub/cub/device/dispatch/dispatch_rle.cuh      |  49 +--
 cub/cub/device/dispatch/dispatch_scan.cuh     |  52 +--
 .../device/dispatch/dispatch_scan_by_key.cuh  |  53 ++--
 .../dispatch/dispatch_segmented_sort.cuh      |  76 ++---
 .../device/dispatch/dispatch_select_if.cuh    |  37 ++-
 .../dispatch/dispatch_three_way_partition.cuh |  48 +--
 .../dispatch/dispatch_unique_by_key.cuh       |  81 +++--
 cub/cub/grid/grid_barrier.cuh                 |  21 +-
 cub/cub/util_allocator.cuh                    | 129 ++++++--
 cub/cub/util_debug.cuh                        |  11 +-
 cub/cub/util_device.cuh                       |  29 +-
 cub/docs/developer_overview.rst               |   6 +-
 cub/test/catch2_test_debug.cu                 |  37 +++
 21 files changed, 843 insertions(+), 414 deletions(-)
 create mode 100644 cub/test/catch2_test_debug.cu

diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh
index 7dceb902758..342e859246c 100644
--- a/cub/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/cub/agent/single_pass_scan_operators.cuh
@@ -701,7 +701,13 @@ struct ScanTileState
         allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives
 
         // Compute allocation pointers into the single storage blob
-        if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+        error = CubDebug(
+          AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
+
+        if (cudaSuccess != error)
+        {
+          break;
+        }
 
         // Alias the offsets
         d_tile_status = reinterpret_cast(allocations[0]);
diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index 39d58de2429..25f5c25e7f4 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -221,10 +221,10 @@ struct DispatchAdjacentDifference : public SelectedPolicy
       void *allocations[1]            = {nullptr};
       std::size_t allocation_sizes[1] = {MayAlias * first_tile_previous_size};
 
-      if (CubDebug(error = AliasTemporaries(d_temp_storage,
-                                            temp_storage_bytes,
-                                            allocations,
-                                            allocation_sizes)))
+      error = CubDebug(
+        AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
+
+      if (cudaSuccess != error)
       {
         break;
       }
@@ -278,15 +278,16 @@ struct DispatchAdjacentDifference : public SelectedPolicy
             num_tiles,
             tile_size);
 
-        error = detail::DebugSyncStream(stream);
+        error = CubDebug(detail::DebugSyncStream(stream));
 
-        if (CubDebug(error))
+        if (cudaSuccess != error)
         {
           break;
         }
 
         // Check for failure to launch
-        if (CubDebug(error = cudaPeekAtLastError()))
+        error = CubDebug(cudaPeekAtLastError());
+        if (cudaSuccess != error)
         {
           break;
         }
@@ -319,15 +320,16 @@ struct DispatchAdjacentDifference : public SelectedPolicy
             difference_op,
             num_items);
 
-        error = detail::DebugSyncStream(stream);
+        error = CubDebug(detail::DebugSyncStream(stream));
 
-        if (CubDebug(error))
+        if (cudaSuccess != error)
         {
           break;
         }
 
         // Check for failure to launch
-        if (CubDebug(error = cudaPeekAtLastError()))
+        error = CubDebug(cudaPeekAtLastError());
+        if (cudaSuccess != error)
         {
           break;
         }
@@ -352,7 +354,8 @@ struct DispatchAdjacentDifference : public SelectedPolicy
    {
      // Get PTX version
      int ptx_version = 0;
-     if (CubDebug(error = PtxVersion(ptx_version)))
+     error = CubDebug(PtxVersion(ptx_version));
+     if (cudaSuccess != error)
      {
        break;
      }
@@ -367,7 +370,8 @@ struct DispatchAdjacentDifference : public SelectedPolicy
                                        stream);
 
      // Dispatch to chained policy
-     if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch)))
+     error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch));
+     if (cudaSuccess != error)
      {
        break;
      }
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index a8b5a1f6550..4dfddce59e2 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -470,13 +470,14 @@ struct DispatchBatchMemcpy : SelectedPolicy
     std::size_t buffer_offset_scan_storage = 0;
     std::size_t blev_block_scan_storage    = 0;
 
-    CubDebug(error = BLevBufferOffsetTileState::AllocationSize(static_cast(num_tiles),
+    error = CubDebug(BLevBufferOffsetTileState::AllocationSize(static_cast(num_tiles),
                                                                buffer_offset_scan_storage));
     if (error)
     {
       return error;
     }
-    CubDebug(error = BLevBlockOffsetTileState::AllocationSize(static_cast(num_tiles),
+
+    error = CubDebug(BLevBlockOffsetTileState::AllocationSize(static_cast(num_tiles),
                                                               blev_block_scan_storage));
     if (error)
     {
@@ -504,8 +505,8 @@ struct DispatchBatchMemcpy : SelectedPolicy
     }
 
     // Alias memory buffers into the storage blob
-    if (CubDebug(
-          error = temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes)))
+    error = CubDebug(temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes));
+    if (cudaSuccess != error)
     {
       return error;
     }
@@ -551,25 +552,26 @@ struct DispatchBatchMemcpy : SelectedPolicy
 
     // Get device ordinal
     int device_ordinal;
-    if (CubDebug(error = cudaGetDevice(&device_ordinal)))
+    error = CubDebug(cudaGetDevice(&device_ordinal));
+    if (cudaSuccess != error)
     {
       return error;
     }
 
     // Get SM count
     int sm_count;
-    if (CubDebug(error = cudaDeviceGetAttribute(&sm_count,
-                                                cudaDevAttrMultiProcessorCount,
-                                                device_ordinal)))
+    error =
+      CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
+    if (cudaSuccess != error)
     {
       return error;
     }
 
     // Get SM occupancy for the batch memcpy block-level buffers kernel
    int batch_memcpy_blev_occupancy;
-    if (CubDebug(error = MaxSmOccupancy(batch_memcpy_blev_occupancy,
-                                        multi_block_memcpy_kernel,
-                                        BLEV_BLOCK_THREADS)))
+    error = CubDebug(
+      MaxSmOccupancy(batch_memcpy_blev_occupancy, multi_block_memcpy_kernel, BLEV_BLOCK_THREADS));
+    if (cudaSuccess != error)
     {
       return error;
     }
@@ -579,18 +581,20 @@ struct DispatchBatchMemcpy : SelectedPolicy
 
     // Construct the tile status for the buffer prefix sum
     BLevBufferOffsetTileState buffer_scan_tile_state;
-    if (CubDebug(error = buffer_scan_tile_state.Init(static_cast(num_tiles),
-                                                     blev_buffer_scan_alloc.get(),
-                                                     buffer_offset_scan_storage)))
+    error = CubDebug(buffer_scan_tile_state.Init(static_cast(num_tiles),
+                                                 blev_buffer_scan_alloc.get(),
+                                                 buffer_offset_scan_storage));
+    if (cudaSuccess != error)
     {
       return error;
     }
 
     // Construct the tile status for thread blocks-to-buffer-assignment prefix sum
     BLevBlockOffsetTileState block_scan_tile_state;
-    if (CubDebug(error = block_scan_tile_state.Init(static_cast(num_tiles),
-                                                    blev_block_scan_alloc.get(),
-
blev_block_scan_storage))) + error = CubDebug(block_scan_tile_state.Init(static_cast(num_tiles), + blev_block_scan_alloc.get(), + blev_block_scan_storage)); + if (cudaSuccess != error) { return error; } @@ -612,16 +616,17 @@ struct DispatchBatchMemcpy : SelectedPolicy .doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles); // Check for failure to launch - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); + error = CubDebug(detail::DebugSyncStream(stream)); // Check for failure to launch - if (CubDebug(error)) + if (cudaSuccess != error) { return error; } @@ -654,14 +659,15 @@ struct DispatchBatchMemcpy : SelectedPolicy block_scan_tile_state); // Check for failure to launch - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -687,13 +693,14 @@ struct DispatchBatchMemcpy : SelectedPolicy batch_memcpy_grid_size - 1); // Check for failure to launch - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); + error = CubDebug(detail::DebugSyncStream(stream)); return error; } @@ -718,7 +725,8 @@ struct DispatchBatchMemcpy : SelectedPolicy // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { return error; } @@ -733,7 +741,8 @@ struct DispatchBatchMemcpy : SelectedPolicy stream); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { return error; } diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 4621be04003..b0b8d6fa879 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -315,25 +315,27 @@ struct dispatch_histogram { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } // Get SM count int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute(&sm_count, - cudaDevAttrMultiProcessorCount, - device_ordinal))) + error = + CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); + + if (cudaSuccess != error) { break; } // Get SM occupancy for histogram_sweep_kernel int histogram_sweep_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(histogram_sweep_sm_occupancy, - histogram_sweep_kernel, - block_threads))) + error = CubDebug( + MaxSmOccupancy(histogram_sweep_sm_occupancy, histogram_sweep_kernel, block_threads)); + if (cudaSuccess != error) { break; } @@ -378,9 +380,9 @@ struct dispatch_histogram // Alias the temporary allocations from the single storage blob (or compute the // necessary size of the blob) - if (CubDebug( - error = - AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, 
allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -504,14 +506,15 @@ struct dispatch_histogram tile_queue); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -958,7 +961,8 @@ public: { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1012,7 +1016,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1045,7 +1050,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1152,7 +1158,8 @@ public: { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1205,7 +1212,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1313,7 +1321,8 @@ public: { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1330,10 +1339,10 @@ public: for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) { - error = privatized_decode_op[channel].Init(num_output_levels[channel], - upper_level[channel], - lower_level[channel]); - if (CubDebug(error != cudaSuccess)) + error = CubDebug(privatized_decode_op[channel].Init(num_output_levels[channel], + upper_level[channel], + lower_level[channel])); + if (error != cudaSuccess) { // Make sure to also return a reasonable value for `temp_storage_bytes` in case of // an overflow of the bin computation, in which case a subsequent algorithm @@ -1380,7 +1389,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1413,7 +1423,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1525,7 +1536,8 @@ public: { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1581,7 +1593,8 @@ public: row_stride_samples, stream); - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 5bbd3dfdd78..7b6c73a1ddf 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -570,7 +570,8 @@ 
struct DispatchMergeSort : SelectedPolicy { // Get device ordinal int device_ordinal = 0; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -630,10 +631,11 @@ struct DispatchMergeSort : SelectedPolicy if (runtime_shmem_size_check_is_required) { int max_shmem = 0; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_shmem, - cudaDevAttrMaxSharedMemoryPerBlock, - device_ordinal))) + + error = CubDebug( + cudaDeviceGetAttribute(&max_shmem, cudaDevAttrMaxSharedMemoryPerBlock, device_ordinal)); + + if (cudaSuccess != error) { break; } @@ -656,10 +658,9 @@ struct DispatchMergeSort : SelectedPolicy temporary_values_storage_size, virtual_shared_memory_size}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -726,14 +727,15 @@ struct DispatchMergeSort : SelectedPolicy block_sort_launcher.launch(); - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } @@ -789,14 +791,15 @@ struct DispatchMergeSort : SelectedPolicy target_merged_tiles_number, tile_size); - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } @@ -804,14 +807,15 @@ struct DispatchMergeSort : SelectedPolicy // Merge merge_launcher.launch(ping, target_merged_tiles_number); - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } @@ -840,7 +844,10 @@ struct DispatchMergeSort : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + + error = CubDebug(PtxVersion(ptx_version)); + + if (cudaSuccess != error) { break; } @@ -858,7 +865,8 @@ struct DispatchMergeSort : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 84b6ccffd19..9a75e6fe0bb 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1303,14 +1303,15 @@ struct DispatchRadixSort : SelectedPolicy decomposer); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1373,14 +1374,15 @@ 
struct DispatchRadixSort : SelectedPolicy decomposer); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1399,14 +1401,15 @@ struct DispatchRadixSort : SelectedPolicy pass_spine_length); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1435,14 +1438,15 @@ struct DispatchRadixSort : SelectedPolicy decomposer); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1498,9 +1502,23 @@ struct DispatchRadixSort : SelectedPolicy radix_bits = DownsweepPolicyT::RADIX_BITS; radix_digits = 1 << radix_bits; - if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; - if (CubDebug(error = scan_config.Init(scan_kernel))) break; - if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; + error = CubDebug(upsweep_config.Init(upsweep_kernel)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(scan_config.Init(scan_kernel)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(downsweep_config.Init(downsweep_kernel)); + if (cudaSuccess != error) + { + break; + } max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(0); @@ -1572,24 +1590,49 @@ struct DispatchRadixSort : SelectedPolicy do { // initialization - if (CubDebug(error = cudaMemsetAsync( - d_ctrs, 0, num_portions * num_passes * sizeof(AtomicOffsetT), stream))) break; + error = CubDebug( + cudaMemsetAsync(d_ctrs, 0, num_portions * num_passes * sizeof(AtomicOffsetT), stream)); + if (cudaSuccess != error) + { + break; + } // compute num_passes histograms with RADIX_DIGITS bins each - if (CubDebug(error = cudaMemsetAsync - (d_bins, 0, num_passes * RADIX_DIGITS * sizeof(OffsetT), stream))) break; + error = CubDebug( + cudaMemsetAsync(d_bins, 0, num_passes * RADIX_DIGITS * sizeof(OffsetT), stream)); + if (cudaSuccess != error) + { + break; + } int device = -1; int num_sms = 0; - if (CubDebug(error = cudaGetDevice(&device))) break; - if (CubDebug(error = cudaDeviceGetAttribute( - &num_sms, cudaDevAttrMultiProcessorCount, device))) break; + + error = CubDebug(cudaGetDevice(&device)); + if (cudaSuccess != error) + { + break; + } + + error = + CubDebug(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device)); + if (cudaSuccess != error) + { + break; + } const int HISTO_BLOCK_THREADS = ActivePolicyT::HistogramPolicy::BLOCK_THREADS; int histo_blocks_per_sm = 1; auto histogram_kernel = DeviceRadixSortHistogramKernel< MaxPolicyT, IS_DESCENDING, KeyT, OffsetT, DecomposerT>; - if (CubDebug(error = cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &histo_blocks_per_sm, histogram_kernel, 
HISTO_BLOCK_THREADS, 0))) break; + + error = CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&histo_blocks_per_sm, + histogram_kernel, + HISTO_BLOCK_THREADS, + 0)); + if (cudaSuccess != error) + { + break; + } // log histogram_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG @@ -1605,13 +1648,14 @@ struct DispatchRadixSort : SelectedPolicy histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream ).doit(histogram_kernel, d_bins, d_keys.Current(), num_items, begin_bit, end_bit, decomposer); - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { break; } - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1630,13 +1674,14 @@ struct DispatchRadixSort : SelectedPolicy num_passes, SCAN_BLOCK_THREADS, 0, stream ).doit(DeviceRadixSortExclusiveSumKernel, d_bins); - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { break; } - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1657,14 +1702,21 @@ struct DispatchRadixSort : SelectedPolicy for (OffsetT portion = 0; portion < num_portions; ++portion) { PortionOffsetT portion_num_items = - static_cast( - CUB_MIN(num_items - portion * PORTION_SIZE, - static_cast(PORTION_SIZE))); - PortionOffsetT num_blocks = - cub::DivideAndRoundUp(portion_num_items, ONESWEEP_TILE_ITEMS); - if (CubDebug(error = cudaMemsetAsync( - d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), - stream))) break; + static_cast(CUB_MIN(num_items - portion * PORTION_SIZE, + static_cast(PORTION_SIZE))); + + PortionOffsetT num_blocks = cub::DivideAndRoundUp(portion_num_items, + ONESWEEP_TILE_ITEMS); + + error = + CubDebug(cudaMemsetAsync(d_lookback, + 0, + num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), + stream)); + if (cudaSuccess != error) + { + break; + } // log onesweep_kernel configuration #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG @@ -1691,13 +1743,14 @@ struct DispatchRadixSort : SelectedPolicy d_values.Current() + portion * PORTION_SIZE, portion_num_items, current_bit, num_bits, decomposer); - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { break; } - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1745,11 +1798,20 @@ struct DispatchRadixSort : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) + { + break; + } // Get SM count int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + error = CubDebug( + cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); + if (cudaSuccess != error) + { + break; + } // Init regular and alternate-digit kernel configurations PassConfig pass_config, alt_pass_config; @@ -1795,11 +1857,18 @@ struct DispatchRadixSort : SelectedPolicy }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if 
(cudaSuccess != error) + { + break; + } // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) - return cudaSuccess; + { + return cudaSuccess; + } // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size int num_bits = end_bit - begin_bit; @@ -1821,20 +1890,37 @@ struct DispatchRadixSort : SelectedPolicy // Run first pass, consuming from the input's current buffers int current_bit = begin_bit; - if (CubDebug(error = InvokePass( - d_keys.Current(), d_keys_remaining_passes.Current(), - d_values.Current(), d_values_remaining_passes.Current(), - d_spine, spine_length, current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + error = + CubDebug(InvokePass(d_keys.Current(), + d_keys_remaining_passes.Current(), + d_values.Current(), + d_values_remaining_passes.Current(), + d_spine, + spine_length, + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); + if (cudaSuccess != error) + { + break; + } // Run remaining passes while (current_bit < end_bit) { - if (CubDebug(error = InvokePass( - d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_spine, spine_length, current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;; + error = CubDebug(InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_spine, + spine_length, + current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config)); + + if (cudaSuccess != error) + { + break; + } // Invert selectors d_keys_remaining_passes.selector ^= 1; @@ -1898,13 +1984,19 @@ struct DispatchRadixSort : SelectedPolicy (long long)stream); #endif cudaError_t error = cudaSuccess; - error = cudaMemcpyAsync(d_keys.Alternate(), d_keys.Current(), num_items * sizeof(KeyT), - cudaMemcpyDefault, stream); - if (CubDebug(error)) + + error = CubDebug(cudaMemcpyAsync(d_keys.Alternate(), + d_keys.Current(), + num_items * sizeof(KeyT), + cudaMemcpyDefault, + stream)); + if (cudaSuccess != error) { return error; } - if (CubDebug(error = detail::DebugSyncStream(stream))) + + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -1917,13 +2009,18 @@ struct DispatchRadixSort : SelectedPolicy _CubLog("Invoking async copy of %lld values on stream %lld\n", (long long)num_items, (long long)stream); #endif - error = cudaMemcpyAsync(d_values.Alternate(), d_values.Current(), - num_items * sizeof(ValueT), cudaMemcpyDefault, stream); - if (CubDebug(error)) + error = CubDebug(cudaMemcpyAsync(d_values.Alternate(), + d_values.Current(), + num_items * sizeof(ValueT), + cudaMemcpyDefault, + stream)); + if (cudaSuccess != error) { return error; } - if (CubDebug(error = detail::DebugSyncStream(stream))) + + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -2004,7 +2101,12 @@ struct DispatchRadixSort : SelectedPolicy do { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; + + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + break; + } // Create dispatch functor DispatchRadixSort dispatch(d_temp_storage, @@ -2020,8 +2122,11 @@ struct DispatchRadixSort : SelectedPolicy decomposer); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) + { + break; + } } while (0); return error; @@ -2220,14 +2325,15 @@ struct DispatchSegmentedRadixSort : SelectedPolicy current_bit, pass_bits, decomposer); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -2290,7 +2396,11 @@ struct DispatchSegmentedRadixSort : SelectedPolicy }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + break; + } // Return if the caller is simply requesting the size of the storage allocation if (d_temp_storage == NULL) @@ -2320,20 +2430,32 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Run first pass, consuming from the input's current buffers int current_bit = begin_bit; - if (CubDebug(error = InvokePass( - d_keys.Current(), d_keys_remaining_passes.Current(), - d_values.Current(), d_values_remaining_passes.Current(), - current_bit, - (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; + error = + CubDebug(InvokePass(d_keys.Current(), + d_keys_remaining_passes.Current(), + d_values.Current(), + d_values_remaining_passes.Current(), + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); + if (cudaSuccess != error) + { + break; + } // Run remaining passes while (current_bit < end_bit) { - if (CubDebug(error = InvokePass( - d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + error = CubDebug(InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config)); + if (cudaSuccess != error) + { + break; + } // Invert selectors and update current bit d_keys_remaining_passes.selector ^= 1; @@ -2409,7 +2531,12 @@ struct DispatchSegmentedRadixSort : SelectedPolicy do { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; + + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + break; + } // Create dispatch functor DispatchSegmentedRadixSort dispatch( @@ -2420,8 +2547,11 @@ struct DispatchSegmentedRadixSort : SelectedPolicy stream, ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) + { + break; + } } while (0); return error; diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index e3402b6d8c9..698ce0552e8 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -676,14 +676,15 @@ struct DispatchReduce : SelectedPolicy .doit(single_tile_kernel, d_in, d_out, num_items, reduction_op, init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -726,24 +727,25 @@ struct DispatchReduce : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) + { break; + } // Get SM count int sm_count; - if (CubDebug( - error = cudaDeviceGetAttribute(&sm_count, - cudaDevAttrMultiProcessorCount, - device_ordinal))) + error = + CubDebug(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); + if (cudaSuccess != error) { break; } // Init regular kernel configuration KernelConfig reduce_config; - if (CubDebug( - error = reduce_config.Init( - reduce_kernel))) + error = CubDebug(reduce_config.Init(reduce_kernel)); + if (cudaSuccess != error) { break; } @@ -764,10 +766,9 @@ struct DispatchReduce : 
SelectedPolicy // Alias the temporary allocations from the single storage blob (or // compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -810,14 +811,15 @@ struct DispatchReduce : SelectedPolicy reduction_op); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -845,14 +847,15 @@ struct DispatchReduce : SelectedPolicy init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -956,7 +959,8 @@ struct DispatchReduce : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -973,7 +977,8 @@ struct DispatchReduce : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1199,10 +1204,9 @@ struct DispatchSegmentedReduce : SelectedPolicy // Init kernel configuration KernelConfig segmented_reduce_config; - if (CubDebug( - error = segmented_reduce_config - .Init( - segmented_reduce_kernel))) + error = CubDebug(segmented_reduce_config.Init( + segmented_reduce_kernel)); + if (cudaSuccess != error) { break; } @@ -1234,14 +1238,15 @@ struct DispatchSegmentedReduce : SelectedPolicy init); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -1341,7 +1346,8 @@ struct DispatchSegmentedReduce : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1360,7 +1366,8 @@ struct DispatchSegmentedReduce : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index d56f4cb4d02..a29c1376a41 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -295,7 +295,8 @@ struct DispatchReduceByKey { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = 
CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -306,7 +307,8 @@ struct DispatchReduceByKey // Specify temporary storage allocation requirements size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } @@ -314,9 +316,10 @@ struct DispatchReduceByKey // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[1] = {}; - if (CubDebug( - error = - AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -330,7 +333,8 @@ struct DispatchReduceByKey // Construct the tile status interface ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) + error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -353,14 +357,15 @@ struct DispatchReduceByKey .doit(init_kernel, tile_state, num_tiles, d_num_runs_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -373,17 +378,18 @@ struct DispatchReduceByKey // Get SM occupancy for reduce_by_key_kernel int reduce_by_key_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(reduce_by_key_sm_occupancy, - reduce_by_key_kernel, - block_threads))) + error = + CubDebug(MaxSmOccupancy(reduce_by_key_sm_occupancy, reduce_by_key_kernel, block_threads)); + + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -422,14 +428,15 @@ struct DispatchReduceByKey num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -517,7 +524,8 @@ struct DispatchReduceByKey { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -535,7 +543,8 @@ struct DispatchReduceByKey stream); // Dispatch - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 6b978fa95fc..06c6fc90f7b 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -298,8 +298,11 @@ struct DeviceRleDispatch { // 
Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) + { break; + } // Number of input tiles int tile_size = block_threads * items_per_thread; @@ -307,7 +310,8 @@ struct DeviceRleDispatch // Specify temporary storage allocation requirements size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } @@ -315,9 +319,10 @@ struct DeviceRleDispatch // Compute allocation pointers into the single storage blob (or compute the necessary size of // the blob) void *allocations[1] = {}; - if (CubDebug( - error = - AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (error != cudaSuccess) { break; } @@ -330,7 +335,8 @@ struct DeviceRleDispatch // Construct the tile status interface ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) + error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -353,14 +359,15 @@ struct DeviceRleDispatch .doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -373,17 +380,18 @@ struct DeviceRleDispatch // Get SM occupancy for device_rle_sweep_kernel int device_rle_kernel_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(device_rle_kernel_sm_occupancy, // out - device_rle_sweep_kernel, - block_threads))) + error = CubDebug(MaxSmOccupancy(device_rle_kernel_sm_occupancy, // out + device_rle_sweep_kernel, + block_threads)); + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -423,14 +431,15 @@ struct DeviceRleDispatch num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -506,7 +515,8 @@ struct DeviceRleDispatch { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -522,7 +532,8 @@ struct DeviceRleDispatch stream); // Dispatch - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh 
b/cub/cub/device/dispatch/dispatch_scan.cuh index a4bf2eff5fa..f16f1c0fd96 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -365,7 +365,8 @@ struct DispatchScan : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -377,8 +378,8 @@ struct DispatchScan : SelectedPolicy // Specify temporary storage allocation requirements size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, - allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } @@ -386,10 +387,10 @@ struct DispatchScan : SelectedPolicy // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[1] = {}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -409,9 +410,8 @@ struct DispatchScan : SelectedPolicy // Construct the tile status interface ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, - allocations[0], - allocation_sizes[0]))) + error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -436,32 +436,33 @@ struct DispatchScan : SelectedPolicy .doit(init_kernel, tile_state, num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Get SM occupancy for scan_kernel int scan_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out - scan_kernel, - Policy::BLOCK_THREADS))) + error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out + scan_kernel, + Policy::BLOCK_THREADS)); + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, - cudaDevAttrMaxGridDimX, - device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -499,14 +500,15 @@ struct DispatchScan : SelectedPolicy num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -581,7 +583,8 @@ struct DispatchScan : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -598,7 +601,8 @@ struct DispatchScan : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = 
CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 2139721f734..b70e49be272 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -379,7 +379,8 @@ struct DispatchScanByKey : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -391,9 +392,8 @@ struct DispatchScanByKey : SelectedPolicy // Specify temporary storage allocation requirements size_t allocation_sizes[2]; - if (CubDebug( - error = ScanByKeyTileStateT::AllocationSize(num_tiles, - allocation_sizes[0]))) + error = CubDebug(ScanByKeyTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; // bytes needed for tile status descriptors } @@ -403,10 +403,10 @@ struct DispatchScanByKey : SelectedPolicy // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[2] = {}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -428,9 +428,8 @@ struct DispatchScanByKey : SelectedPolicy // Construct the tile status interface ScanByKeyTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, - allocations[0], - allocation_sizes[0]))) + error = CubDebug(tile_state.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -459,33 +458,34 @@ struct DispatchScanByKey : SelectedPolicy num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Get SM occupancy for scan_kernel int scan_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out - scan_kernel, - Policy::BLOCK_THREADS))) + error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out + scan_kernel, + Policy::BLOCK_THREADS)); + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, - cudaDevAttrMaxGridDimX, - device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -526,14 +526,15 @@ struct DispatchScanByKey : SelectedPolicy num_items); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -619,7 +620,8 @@ struct DispatchScanByKey : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { 
break; } @@ -638,7 +640,8 @@ struct DispatchScanByKey : SelectedPolicy ptx_version); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index d00e998f165..2eec9290bb9 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -599,14 +599,15 @@ DeviceSegmentedSortContinuation( d_end_offsets); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -657,14 +658,15 @@ DeviceSegmentedSortContinuation( d_end_offsets); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -734,7 +736,7 @@ DeviceSegmentedSortContinuationKernel( small_segments_indices, 0); // always launching on the main stream (see motivation above) - CubDebug(error); + error = CubDebug(error); } #endif // CUB_RDC_ENABLED @@ -1357,9 +1359,8 @@ struct DispatchSegmentedSort : SelectedPolicy break; } - if (CubDebug( - error = temporary_storage_layout.map_to_buffer(d_temp_storage, - temp_storage_bytes))) + error = CubDebug(temporary_storage_layout.map_to_buffer(d_temp_storage, temp_storage_bytes)); + if (cudaSuccess != error) { break; } @@ -1500,7 +1501,8 @@ struct DispatchSegmentedSort : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -1518,7 +1520,8 @@ struct DispatchSegmentedSort : SelectedPolicy stream); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } @@ -1610,19 +1613,18 @@ private: THRUST_NS_QUALIFIER::make_reverse_iterator( large_and_medium_segments_indices.get() + num_segments); - error = cub::DevicePartition::If( - device_partition_temp_storage.get(), - three_way_partition_temp_storage_bytes, - THRUST_NS_QUALIFIER::counting_iterator(0), - large_and_medium_segments_indices.get(), - small_segments_indices.get(), - medium_indices_iterator, - group_sizes.get(), - num_segments, - large_segments_selector, - small_segments_selector, - stream); - if (CubDebug(error)) + error = CubDebug(cub::DevicePartition::If(device_partition_temp_storage.get(), + three_way_partition_temp_storage_bytes, + THRUST_NS_QUALIFIER::counting_iterator(0), + large_and_medium_segments_indices.get(), + small_segments_indices.get(), + medium_indices_iterator, + group_sizes.get(), + num_segments, + large_segments_selector, + small_segments_selector, + stream)); + if (cudaSuccess != error) { return error; } @@ -1661,14 +1663,15 @@ private: group_sizes.get(), \ 
large_and_medium_segments_indices.get(), \ small_segments_indices.get()); \ + error = CubDebug(error); \ \ - if (CubDebug(error)) \ + if (cudaSuccess != error) \ { \ return error; \ } \ \ - error = detail::DebugSyncStream(stream); \ - if (CubDebug(error)) \ + error = CubDebug(detail::DebugSyncStream(stream)); \ + if (cudaSuccess != error) \ { \ return error; \ } @@ -1681,18 +1684,20 @@ private: NV_IS_HOST, ( unsigned int h_group_sizes[num_selected_groups]; - - if (CubDebug(error = cudaMemcpyAsync(h_group_sizes, + error = CubDebug(cudaMemcpyAsync(h_group_sizes, group_sizes.get(), num_selected_groups * sizeof(unsigned int), cudaMemcpyDeviceToHost, - stream))) + stream)); + + if (cudaSuccess != error) { return error; } - if (CubDebug(error = SyncStream(stream))) + error = CubDebug(SyncStream(stream)); + if (cudaSuccess != error) { return error; } @@ -1763,14 +1768,15 @@ private: d_end_offsets); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { return error; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { return error; } diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index a2f3536ed2e..56fa86e2ad9 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -338,7 +338,8 @@ struct DispatchSelectIf : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -350,14 +351,17 @@ struct DispatchSelectIf : SelectedPolicy size_t allocation_sizes[1]; // bytes needed for tile status descriptors - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; } // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) void* allocations[1] = {}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) + + error = CubDebug(AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -370,7 +374,8 @@ struct DispatchSelectIf : SelectedPolicy // Construct the tile status interface ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) + error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -391,14 +396,15 @@ struct DispatchSelectIf : SelectedPolicy d_num_selected_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -411,7 +417,8 @@ struct DispatchSelectIf : SelectedPolicy // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + error = 
CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -427,9 +434,10 @@ struct DispatchSelectIf : SelectedPolicy { // Get SM occupancy for select_if_kernel int range_select_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(range_select_sm_occupancy, // out - select_if_kernel, - block_threads))) + error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out + select_if_kernel, + block_threads)); + if (cudaSuccess != error) { break; } @@ -461,14 +469,15 @@ struct DispatchSelectIf : SelectedPolicy num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index ed22bc86b12..2277956e24c 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -221,7 +221,8 @@ struct DispatchThreeWayPartitionIf { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) { break; } @@ -233,7 +234,8 @@ struct DispatchThreeWayPartitionIf // Specify temporary storage allocation requirements size_t allocation_sizes[1]; // bytes needed for tile status descriptors - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -241,10 +243,10 @@ struct DispatchThreeWayPartitionIf // Compute allocation pointers into the single storage blob (or compute // the necessary size of the blob) void *allocations[1] = {}; - if (CubDebug(error = cub::AliasTemporaries(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes))) + + error = CubDebug( + cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) { break; } @@ -265,7 +267,8 @@ struct DispatchThreeWayPartitionIf // Construct the tile status interface ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) + error = CubDebug(tile_status.Init(num_tiles, allocations[0], allocation_sizes[0])); + if (cudaSuccess != error) { break; } @@ -291,22 +294,23 @@ struct DispatchThreeWayPartitionIf d_num_selected_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } // Get max x-dimension of grid int max_dim_x; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) + error = CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) { break; } @@ -322,9 +326,10 @@ struct DispatchThreeWayPartitionIf { // Get SM occupancy for select_if_kernel int 
range_select_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(range_select_sm_occupancy, // out - three_way_partition_kernel, - block_threads))) + error = CubDebug(MaxSmOccupancy(range_select_sm_occupancy, // out + three_way_partition_kernel, + block_threads)); + if (cudaSuccess != error) { break; } @@ -360,14 +365,15 @@ struct DispatchThreeWayPartitionIf num_tiles); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -419,7 +425,8 @@ struct DispatchThreeWayPartitionIf { // Get PTX version int ptx_version = 0; - if (CubDebug(error = cub::PtxVersion(ptx_version))) + error = CubDebug(cub::PtxVersion(ptx_version)); + if (cudaSuccess != error) { break; } @@ -437,7 +444,8 @@ struct DispatchThreeWayPartitionIf stream); // Dispatch - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) { break; } diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 07aeb59526f..e70d28f2291 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -216,7 +216,11 @@ struct DispatchUniqueByKey : SelectedPolicy { // Get device ordinal int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + error = CubDebug(cudaGetDevice(&device_ordinal)); + if (cudaSuccess != error) + { + break; + } // Number of input tiles int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; @@ -224,10 +228,11 @@ struct DispatchUniqueByKey : SelectedPolicy // Size of virtual shared memory int max_shmem = 0; - if (CubDebug( - error = cudaDeviceGetAttribute(&max_shmem, - cudaDevAttrMaxSharedMemoryPerBlock, - device_ordinal))) + + error = CubDebug(cudaDeviceGetAttribute(&max_shmem, + cudaDevAttrMaxSharedMemoryPerBlock, + device_ordinal)); + if (cudaSuccess != error) { break; } @@ -235,11 +240,24 @@ struct DispatchUniqueByKey : SelectedPolicy // Specify temporary storage allocation requirements size_t allocation_sizes[2] = {0, vshmem_size}; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Bytes needed for tile status descriptors + error = CubDebug(ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0])); + if (cudaSuccess != error) + { + break; + } // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) void *allocations[2] = {NULL, NULL}; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + error = CubDebug( + AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)); + if (cudaSuccess != error) + { + break; + } + if (d_temp_storage == NULL) { // Return if the caller is simply requesting the size of the storage allocation @@ -248,7 +266,11 @@ struct DispatchUniqueByKey : SelectedPolicy // Construct the tile status interface ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + error = CubDebug(tile_state.Init(num_tiles, allocations[0], 
allocation_sizes[0])); + if (cudaSuccess != error) + { + break; + } // Log init_kernel configuration num_tiles = CUB_MAX(1, num_tiles); @@ -264,11 +286,15 @@ struct DispatchUniqueByKey : SelectedPolicy ).doit(init_kernel, tile_state, num_tiles, d_num_selected_out); // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; + error = CubDebug(cudaPeekAtLastError()); + if (cudaSuccess != error) + { + break; + } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -278,7 +304,12 @@ struct DispatchUniqueByKey : SelectedPolicy // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; + error = + CubDebug(cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal)); + if (cudaSuccess != error) + { + break; + } // Get grid size for scanning tiles dim3 scan_grid_size; @@ -291,9 +322,10 @@ struct DispatchUniqueByKey : SelectedPolicy { // Get SM occupancy for unique_by_key_kernel int scan_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out - scan_kernel, - Policy::BLOCK_THREADS))) + error = CubDebug(MaxSmOccupancy(scan_sm_occupancy, // out + scan_kernel, + Policy::BLOCK_THREADS)); + if (cudaSuccess != error) { break; } @@ -325,14 +357,15 @@ struct DispatchUniqueByKey : SelectedPolicy num_tiles); // Check for failure to launch - if (CubDebug(error)) + error = CubDebug(error); + if (cudaSuccess != error) { break; } // Sync the stream if specified to flush runtime errors - error = detail::DebugSyncStream(stream); - if (CubDebug(error)) + error = CubDebug(detail::DebugSyncStream(stream)); + if (cudaSuccess != error) { break; } @@ -388,7 +421,11 @@ struct DispatchUniqueByKey : SelectedPolicy { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + break; + } // Create dispatch functor DispatchUniqueByKey dispatch( @@ -404,7 +441,11 @@ struct DispatchUniqueByKey : SelectedPolicy stream); // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + error = CubDebug(MaxPolicyT::Invoke(ptx_version, dispatch)); + if (cudaSuccess != error) + { + break; + } } while (0); diff --git a/cub/cub/grid/grid_barrier.cuh b/cub/cub/grid/grid_barrier.cuh index ca946f3cc84..063a2c395cd 100644 --- a/cub/cub/grid/grid_barrier.cuh +++ b/cub/cub/grid/grid_barrier.cuh @@ -154,7 +154,7 @@ public: cudaError_t retval = cudaSuccess; if (d_sync) { - CubDebug(retval = cudaFree(d_sync)); + retval = CubDebug(cudaFree(d_sync)); d_sync = NULL; } sync_bytes = 0; @@ -184,14 +184,27 @@ public: { if (d_sync) { - if (CubDebug(retval = cudaFree(d_sync))) break; + retval = CubDebug(cudaFree(d_sync)); + if (cudaSuccess != retval) + { + break; + } } sync_bytes = new_sync_bytes; // Allocate and initialize to zero - if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; - if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; + retval = CubDebug(cudaMalloc((void**) &d_sync, sync_bytes)); + if (cudaSuccess != retval) + { + break; + } + + retval = CubDebug(cudaMemset(d_sync, 0, new_sync_bytes)); + if (cudaSuccess != retval) + { + break; + } } } while (0); diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index 
bb6d6a41784..20c0a2ad80a 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -361,7 +361,12 @@ struct CachingDeviceAllocator if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } + device = entrypoint_device; } @@ -447,12 +452,22 @@ struct CachingDeviceAllocator // Set runtime's current device to specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - if (CubDebug(error = cudaSetDevice(device))) return error; + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } + + error = CubDebug(cudaSetDevice(device)); + if (cudaSuccess != error) + { + return error; + } } // Attempt to allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) + error = CubDebug(cudaMalloc(&search_key.d_ptr, search_key.bytes)); + if (error == cudaErrorMemoryAllocation) { // The allocation attempt failed: free all cached blocks on device and retry if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", @@ -475,8 +490,17 @@ struct CachingDeviceAllocator // on the current device // Free device memory and destroy stream event. - if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + error = CubDebug(cudaFree(block_itr->d_ptr)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(cudaEventDestroy(block_itr->ready_event)); + if (cudaSuccess != error) + { + break; + } // Reduce balance and erase entry cached_bytes[device].free -= block_itr->bytes; @@ -494,12 +518,21 @@ struct CachingDeviceAllocator if (error) return error; // Try to allocate again - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; + error = CubDebug(cudaMalloc(&search_key.d_ptr, search_key.bytes)); + if (cudaSuccess != error) + { + return error; + } } // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + error = + CubDebug(cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)); + + if (cudaSuccess != error) + { return error; + } // Insert into live blocks mutex.lock(); @@ -513,7 +546,11 @@ struct CachingDeviceAllocator // Attempt to revert back to previous device if necessary if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + error = CubDebug(cudaSetDevice(entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } } } @@ -559,8 +596,11 @@ struct CachingDeviceAllocator if (device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { return error; + } device = entrypoint_device; } @@ -598,21 +638,43 @@ struct CachingDeviceAllocator // First set to specified device (entrypoint may not be set) if (device != entrypoint_device) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - if (CubDebug(error = cudaSetDevice(device))) return error; + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { + return error; 
+ } + + error = CubDebug(cudaSetDevice(device)); + if (cudaSuccess != error) + { + return error; + } } if (recached) { // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + error = CubDebug(cudaEventRecord(search_key.ready_event, search_key.associated_stream)); + if (cudaSuccess != error) + { + return error; + } } if (!recached) { // Free the allocation from the runtime and cleanup the event. - if (CubDebug(error = cudaFree(d_ptr))) return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + error = CubDebug(cudaFree(d_ptr)); + if (cudaSuccess != error) + { + return error; + } + + error = CubDebug(cudaEventDestroy(search_key.ready_event)); + if (cudaSuccess != error) + { + return error; + } if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); @@ -621,7 +683,11 @@ struct CachingDeviceAllocator // Reset device if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + error = CubDebug(cudaSetDevice(entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } } return error; @@ -661,19 +727,36 @@ struct CachingDeviceAllocator // Get entry-point device ordinal if necessary if (entrypoint_device == INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + error = CubDebug(cudaGetDevice(&entrypoint_device)); + if (cudaSuccess != error) + { + break; + } } // Set current device ordinal if necessary if (begin->device != current_device) { - if (CubDebug(error = cudaSetDevice(begin->device))) break; + error = CubDebug(cudaSetDevice(begin->device)); + if (cudaSuccess != error) + { + break; + } current_device = begin->device; } // Free device memory - if (CubDebug(error = cudaFree(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + error = CubDebug(cudaFree(begin->d_ptr)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(cudaEventDestroy(begin->ready_event)); + if (cudaSuccess != error) + { + break; + } // Reduce balance and erase entry const size_t block_bytes = begin->bytes; @@ -690,7 +773,11 @@ struct CachingDeviceAllocator // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + error = CubDebug(cudaSetDevice(entrypoint_device)); + if (cudaSuccess != error) + { + return error; + } } return error; diff --git a/cub/cub/util_debug.cuh b/cub/cub/util_debug.cuh index 80b2bd336ed..5023524dd6e 100644 --- a/cub/cub/util_debug.cuh +++ b/cub/cub/util_debug.cuh @@ -190,18 +190,25 @@ cudaError_t Debug(cudaError_t error, const char *filename, int line) #ifndef CUB_RDC_ENABLED #define CUB_TEMP_DEVICE_CODE #else - #define CUB_TEMP_DEVICE_CODE cudaGetLastError() + #define CUB_TEMP_DEVICE_CODE last_error = cudaGetLastError() #endif + cudaError_t last_error = cudaSuccess; + NV_IF_TARGET( NV_IS_HOST, - (cudaGetLastError();), + (last_error = cudaGetLastError();), 
(CUB_TEMP_DEVICE_CODE;) ); #undef CUB_TEMP_DEVICE_CODE // clang-format on + if (error == cudaSuccess && last_error != cudaSuccess) + { + error = last_error; + } + #ifdef CUB_STDERR if (error) { diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index cc371826a91..d8caaedbb43 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -305,9 +305,8 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) ( cudaFuncAttributes empty_kernel_attrs; - result = cudaFuncGetAttributes(&empty_kernel_attrs, - reinterpret_cast(empty_kernel)); - CubDebug(result); + result = CubDebug(cudaFuncGetAttributes(&empty_kernel_attrs, + reinterpret_cast(empty_kernel))); ptx_version = empty_kernel_attrs.ptxVersion * 10; ), @@ -412,8 +411,17 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d do { int major = 0, minor = 0; - if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device))) break; - if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device))) break; + error = CubDebug(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); + if (cudaSuccess != error) + { + break; + } + + error = CubDebug(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); + if (cudaSuccess != error) + { + break; + } sm_version = major * 100 + minor * 10; } while (0); @@ -535,10 +543,15 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) has_uva = false; cudaError_t error = cudaSuccess; int device = -1; - if (CubDebug(error = cudaGetDevice(&device)) != cudaSuccess) return error; + error = CubDebug(cudaGetDevice(&device)); + if (cudaSuccess != error) + { + return error; + } + int uva = 0; - if (CubDebug(error = cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device)) - != cudaSuccess) + error = CubDebug(cudaDeviceGetAttribute(&uva, cudaDevAttrUnifiedAddressing, device)); + if (cudaSuccess != error) { return error; } diff --git a/cub/docs/developer_overview.rst b/cub/docs/developer_overview.rst index 24adfa42109..c60886d0d0d 100644 --- a/cub/docs/developer_overview.rst +++ b/cub/docs/developer_overview.rst @@ -523,7 +523,11 @@ The dispatch entry point is typically represented by a static member function th do { // Get PTX version int ptx_version = 0; - if (CubDebug(error = PtxVersion(ptx_version))) break; + error = CubDebug(PtxVersion(ptx_version)); + if (cudaSuccess != error) + { + break; + } // Create dispatch functor DispatchSegmentedReduce dispatch( diff --git a/cub/test/catch2_test_debug.cu b/cub/test/catch2_test_debug.cu new file mode 100644 index 00000000000..12771934db4 --- /dev/null +++ b/cub/test/catch2_test_debug.cu @@ -0,0 +1,37 @@ +#include +#include + +#include "catch2_test_helper.h" + +TEST_CASE("CubDebug returns input error", "[debug][utils]") +{ + REQUIRE( CubDebug(cudaSuccess) == cudaSuccess ); + REQUIRE( CubDebug(cudaErrorInvalidConfiguration) == cudaErrorInvalidConfiguration ); +} + +TEST_CASE("CubDebug returns new errors", "[debug][utils]") +{ + cub::EmptyKernel<<<0, 0>>>(); + cudaError error = cudaPeekAtLastError(); + + REQUIRE( error != cudaSuccess ); + REQUIRE( CubDebug(cudaSuccess) != cudaSuccess ); +} + +TEST_CASE("CubDebug prefers input errors", "[debug][utils]") +{ + cub::EmptyKernel<<<0, 0>>>(); + cudaError error = cudaPeekAtLastError(); + + REQUIRE( error != cudaSuccess ); + REQUIRE( CubDebug(cudaErrorMemoryAllocation) != cudaSuccess ); +} + +TEST_CASE("CubDebug resets last error", 
"[debug][utils]") +{ + cub::EmptyKernel<<<0, 0>>>(); + cudaError error = cudaPeekAtLastError(); + + REQUIRE( CubDebug(cudaSuccess) != cudaSuccess ); + REQUIRE( CubDebug(cudaSuccess) == cudaSuccess ); +}