Skip to content

Commit

Permalink
Develop stream 2023-10-27 (#309)
Browse files Browse the repository at this point in the history
* Accumulator types changed for reduce and test_hipcub_device_reduce fixed for new thread operators

* Add thread operators test

* Bump CUB and Thrust versions to 2.1.0

* change how we use the rocprim::host_warp_size

* update changelog

* move host_warp_size_wrapper out of the HIPCUB_HOST_WARP_THREADS macro

* update changelog to be clearer

* add changes related to __int128_t support

* finish int128 support
add tests for block and device_radix_sort
add assert_bit_eq for (u)int128 vectors

* Test large indices for DeviceReduce

* Fix clang format

* Include FetchContent in new ROCmCMakeBuildToolsDependency cmake file

* Use _ENABLE_EXTENDED_ALIGNED_STORAGE for windows build in rmake.py

* Update CHANGELOG to ROCm 6.1

---------

Co-authored-by: Bence Parajdi <[email protected]>
  • Loading branch information
Beanavil and parbenc authored Nov 14, 2023
1 parent f829792 commit f13da05
Show file tree
Hide file tree
Showing 31 changed files with 2,220 additions and 573 deletions.
2 changes: 0 additions & 2 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ BraceWrapping:
AfterNamespace: true
AfterStruct: true
AfterUnion: true
BeforeCatch: true
BeforeElse: true
AfterExternBlock: false
BeforeCatch: true
BeforeElse: true
Expand Down
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

See README.md on how to build the hipCUB documentation using Doxygen.

## (Unreleased) hipCUB-2.13.1 for ROCm 6.1.0
### Changed
- CUB backend references CUB and Thrust version 2.1.0.
- Updated `HIPCUB_HOST_WARP_THREADS` macro definition to match `host_warp_size` changes from rocPRIM 3.0.
- Implemented `__int128_t` and `__uint128_t` support for radix_sort.
### Fixed
- Fixed build issues with `rmake.py` on Windows when using VS 2017 15.8 or later due to a breaking fix with extended aligned storage.

## (Unreleased) hipCUB-2.13.1 for ROCm 5.7.0
### Changed
- CUB backend references CUB and Thrust version 2.0.1.
Expand Down
11 changes: 7 additions & 4 deletions benchmark/benchmark_device_reduce.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -119,15 +119,15 @@ template<typename T>
struct Benchmark<T, hipcub::Sum> {
static void run(benchmark::State& state, size_t size, const hipStream_t stream)
{
run_benchmark<T, T>(state, size, stream, hipcub::DeviceReduce::Sum<T*, T*>);
run_benchmark<T, T>(state, size, stream, hipcub::DeviceReduce::Sum<T*, T*, int>);
}
};

template<typename T>
struct Benchmark<T, hipcub::Min> {
static void run(benchmark::State& state, size_t size, const hipStream_t stream)
{
run_benchmark<T, T>(state, size, stream, hipcub::DeviceReduce::Min<T*, T*>);
run_benchmark<T, T>(state, size, stream, hipcub::DeviceReduce::Min<T*, T*, int>);
}
};

Expand All @@ -139,7 +139,10 @@ struct Benchmark<T, hipcub::ArgMin> {

static void run(benchmark::State& state, size_t size, const hipStream_t stream)
{
run_benchmark<T, KeyValue>(state, size, stream, hipcub::DeviceReduce::ArgMin<T*, KeyValue*>);
run_benchmark<T, KeyValue>(state,
size,
stream,
hipcub::DeviceReduce::ArgMin<T*, KeyValue*, int>);
}
};

Expand Down
24 changes: 12 additions & 12 deletions cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -147,54 +147,54 @@ if(HIP_COMPILER STREQUAL "nvcc")

if(NOT DEFINED CUB_INCLUDE_DIR)
file(
DOWNLOAD https://github.com/NVIDIA/cub/archive/2.0.1.zip
${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip
DOWNLOAD https://github.com/NVIDIA/cub/archive/2.1.0.zip
${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip
STATUS cub_download_status LOG cub_download_log
)
list(GET cub_download_status 0 cub_download_error_code)
if(cub_download_error_code)
message(FATAL_ERROR "Error: downloading "
"https://github.com/NVIDIA/cub/archive/2.0.1.zip failed "
"https://github.com/NVIDIA/cub/archive/2.1.0.zip failed "
"error_code: ${cub_download_error_code} "
"log: ${cub_download_log} "
)
endif()

execute_process(
COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip
COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
RESULT_VARIABLE cub_unpack_error_code
)
if(cub_unpack_error_code)
message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip failed")
message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0.zip failed")
endif()
set(CUB_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1/ CACHE PATH "")
set(CUB_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub-2.1.0/ CACHE PATH "")
endif()

if(NOT DEFINED THRUST_INCLUDE_DIR)
file(
DOWNLOAD https://github.com/NVIDIA/thrust/archive/2.0.1.zip
${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip
DOWNLOAD https://github.com/NVIDIA/thrust/archive/2.1.0.zip
${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip
STATUS thrust_download_status LOG thrust_download_log
)
list(GET thrust_download_status 0 thrust_download_error_code)
if(thrust_download_error_code)
message(FATAL_ERROR "Error: downloading "
"https://github.com/NVIDIA/thrust/archive/2.0.1.zip failed "
"https://github.com/NVIDIA/thrust/archive/2.1.0.zip failed "
"error_code: ${thrust_download_error_code} "
"log: ${thrust_download_log} "
)
endif()

execute_process(
COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip
COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
RESULT_VARIABLE thrust_unpack_error_code
)
if(thrust_unpack_error_code)
message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip failed")
message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0.zip failed")
endif()
set(THRUST_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1/ CACHE PATH "")
set(THRUST_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.1.0/ CACHE PATH "")
endif()
else()
# rocPRIM (only for ROCm platform)
Expand Down
23 changes: 23 additions & 0 deletions hipcub/include/hipcub/backend/cub/device/device_merge_sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,29 @@ struct DeviceMergeSort
compare_op,
stream));
}

template<typename KeyInputIteratorT,
typename KeyIteratorT,
typename OffsetT,
typename CompareOpT>
HIPCUB_RUNTIME_FUNCTION static hipError_t StableSortKeysCopy(void* d_temp_storage,
std::size_t& temp_storage_bytes,
KeyInputIteratorT d_input_keys,
KeyIteratorT d_output_keys,
OffsetT num_items,
CompareOpT compare_op,
hipStream_t stream = 0,
bool debug_synchronous = false)
{
(void)debug_synchronous;
return hipCUDAErrorTohipError(::cub::DeviceMergeSort::StableSortKeysCopy(d_temp_storage,
temp_storage_bytes,
d_input_keys,
d_output_keys,
num_items,
compare_op,
stream));
}
};

END_HIPCUB_NAMESPACE
Expand Down
169 changes: 73 additions & 96 deletions hipcub/include/hipcub/backend/cub/device/device_reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,20 @@ BEGIN_HIPCUB_NAMESPACE
class DeviceReduce
{
public:
template <
typename InputIteratorT,
typename OutputIteratorT,
typename ReduceOpT,
typename T
>
HIPCUB_RUNTIME_FUNCTION static
hipError_t Reduce(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
ReduceOpT reduction_op,
T init,
hipStream_t stream = 0,
bool debug_synchronous = false)
template<typename InputIteratorT,
typename OutputIteratorT,
typename ReduceOpT,
typename T,
typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static hipError_t Reduce(void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumItemsT num_items,
ReduceOpT reduction_op,
T init,
hipStream_t stream = 0,
bool debug_synchronous = false)
{
(void)debug_synchronous;
return hipCUDAErrorTohipError(::cub::DeviceReduce::Reduce(d_temp_storage,
Expand All @@ -67,18 +65,14 @@ class DeviceReduce
stream));
}

template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
hipError_t Sum(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
template<typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static hipError_t Sum(void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumItemsT num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
{
(void)debug_synchronous;
return hipCUDAErrorTohipError(::cub::DeviceReduce::Sum(d_temp_storage,
Expand All @@ -89,18 +83,14 @@ class DeviceReduce
stream));
}

template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
hipError_t Min(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
template<typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static hipError_t Min(void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumItemsT num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
{
(void)debug_synchronous;
return hipCUDAErrorTohipError(::cub::DeviceReduce::Min(d_temp_storage,
Expand All @@ -111,18 +101,14 @@ class DeviceReduce
stream));
}

template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
hipError_t ArgMin(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
template<typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMin(void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumItemsT num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
{
(void)debug_synchronous;
return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMin(d_temp_storage,
Expand All @@ -133,18 +119,14 @@ class DeviceReduce
stream));
}

template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
hipError_t Max(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
template<typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static hipError_t Max(void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumItemsT num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
{
(void)debug_synchronous;
return hipCUDAErrorTohipError(::cub::DeviceReduce::Max(d_temp_storage,
Expand All @@ -155,18 +137,14 @@ class DeviceReduce
stream));
}

template <
typename InputIteratorT,
typename OutputIteratorT
>
HIPCUB_RUNTIME_FUNCTION static
hipError_t ArgMax(void *d_temp_storage,
size_t &temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
int num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
template<typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static hipError_t ArgMax(void* d_temp_storage,
size_t& temp_storage_bytes,
InputIteratorT d_in,
OutputIteratorT d_out,
NumItemsT num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
{
(void)debug_synchronous;
return hipCUDAErrorTohipError(::cub::DeviceReduce::ArgMax(d_temp_storage,
Expand All @@ -177,26 +155,25 @@ class DeviceReduce
stream));
}

template<
typename KeysInputIteratorT,
typename UniqueOutputIteratorT,
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename ReductionOpT
>
HIPCUB_RUNTIME_FUNCTION static
hipError_t ReduceByKey(void * d_temp_storage,
size_t& temp_storage_bytes,
KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
ReductionOpT reduction_op,
int num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
template<typename KeysInputIteratorT,
typename UniqueOutputIteratorT,
typename ValuesInputIteratorT,
typename AggregatesOutputIteratorT,
typename NumRunsOutputIteratorT,
typename ReductionOpT,
typename NumItemsT>
HIPCUB_RUNTIME_FUNCTION static hipError_t
ReduceByKey(void* d_temp_storage,
size_t& temp_storage_bytes,
KeysInputIteratorT d_keys_in,
UniqueOutputIteratorT d_unique_out,
ValuesInputIteratorT d_values_in,
AggregatesOutputIteratorT d_aggregates_out,
NumRunsOutputIteratorT d_num_runs_out,
ReductionOpT reduction_op,
NumItemsT num_items,
hipStream_t stream = 0,
bool debug_synchronous = false)
{
(void)debug_synchronous;
return hipCUDAErrorTohipError(::cub::DeviceReduce::ReduceByKey(d_temp_storage,
Expand Down
Loading

0 comments on commit f13da05

Please sign in to comment.