diff --git a/CHANGELOG.md b/CHANGELOG.md index fc021075..ab6a1b62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,10 +2,11 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/projects/hipCUB/en/latest/](https://rocm.docs.amd.com/projects/hipCUB/en/latest/). -## (Unreleased) hipCUB-x.x.x for ROCm 6.4.0 +## hipCUB-3.4.0 for ROCm 6.4.0 ### Added * Added regression tests to `rtest.py`. These tests recreate scenarios that have caused hardware problems in past emulation environments. Use `python rtest.py [--emulation|-e|--test|-t]=regression` to run these tests. +* Added extended tests to `rtest.py`. These are additional tests that do not fit the criteria of the smoke or regression tests, and they take much longer to run than either. Use `python rtest.py [--emulation|-e|--test|-t]=extended` to run these tests. * Added `ForEach`, `ForEachN`, `ForEachCopy`, `ForEachCopyN` and `Bulk` functions to have parity with CUB. * Added the `hipcub::CubVector` type for CUB parity. * Added `--emulation` option for `rtest.py` @@ -18,25 +19,19 @@ Full documentation for hipCUB is available at [https://rocm.docs.amd.com/project * The NVIDIA backend now requires CUB, Thrust and libcu++ 2.5.0. If it is not found it will be downloaded from the NVIDIA CCCL repository. * Changed the C++ version from 14 to 17. C++14 will be deprecated in the next major release. -## hipCUB-3.3.0 for ROCm 6.3.0 - -### Fixed -* Not all headers in hipCUB included `config.hpp` which could have resulted in build errors. +## hipCUB 3.3.0 for ROCm 6.3.0 ### Added + * Support for large indices in `hipcub::DeviceSegmentedReduce::*` has been added, with the exception of `DeviceSegmentedReduce::Arg*`. Although rocPRIM's backend provides support for all reduce variants, CUB does not support large indices in `DeviceSegmentedReduce::Arg*`. For this reason, large index support is not available for `hipcub::DeviceSegmentedReduce::Arg*`. 
-* Add -t smoke option in rtest.py. It will run a subset of tests such that the total test time is in 5 minutes. Use python3 ./rtest.py --test smoke or python3 ./rtest.py -t smoke to execute smoke test. -* Add inplace overloads of `DeviceScan` functions. -* Add inplace overloads of `DeviceSelect::Flagged` and `DeviceSelect::If`. -* Add `DeviceReduce::TransformReduce`. -* Add `DeviceSelect::UniqueByKey` overload with `equality_op`. -* Add support for large indices in `DeviceSelect::UniqueByKey`. ### Changed -* The NVIDIA backend now requires CUB, Thrust and libcu++ 2.4.0. If it is not found it will be downloaded from the NVIDIA CCCL repository. -### Resolved issues +* Changed the default value of `rmake.py -a` to `default_gpus`. This is equivalent to `gfx906:xnack-,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201`. +* The NVIDIA backend now requires CUB, Thrust, and libcu++ 2.3.2. +### Resolved issues +* Fixed an issue in `rmake.py` where the list storing cmake options would contain individual characters instead of a full string of options. * Fixed an issue where `config.hpp` was not included in all hipCUB headers, resulting in build errors. 
## hipCUB-3.2.0 for ROCm 6.2.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 424d3efc..b050df24 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,11 +91,11 @@ if(NOT (CMAKE_CXX_COMPILER MATCHES ".*nvcc$" OR "${CMAKE_CXX_COMPILER_ID}" STREQ if(BUILD_ADDRESS_SANITIZER) # ASAN builds require xnack rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+" + TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+" ) else() rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" + TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" ) endif() set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) @@ -115,7 +115,7 @@ if(BUILD_ADDRESS_SANITIZER) endif() # Setup VERSION -set(VERSION_STRING "3.3.0") +set(VERSION_STRING "3.4.0") rocm_setup_version(VERSION ${VERSION_STRING}) # Print configuration summary diff --git a/rmake.py b/rmake.py index 53bcf269..4d828528 100644 --- a/rmake.py +++ b/rmake.py @@ -20,6 +20,8 @@ def parse_args(): parser = argparse.ArgumentParser(description=""" Checks build arguments """) + default_gpus = 'gfx906:xnack-,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201' + parser.add_argument('-g', '--debug', required=False, default=False, action='store_true', help='Generate Debug build (default: False)') parser.add_argument( '--build_dir', type=str, required=False, default="build", @@ -35,7 +37,7 @@ def parse_args(): help='Install after build (default: False)') parser.add_argument( '--cmake-darg', required=False, dest='cmake_dargs', action='append', default=[], help='List of additional cmake defines for builds (e.g. 
CMAKE_CXX_COMPILER_LAUNCHER=ccache)') - parser.add_argument('-a', '--architecture', dest='gpu_architecture', required=False, default="gfx906;gfx1030;gfx1100;gfx1101;gfx1102", #:sramecc+:xnack-" ) #gfx1030" ) #gfx906" ) # gfx1030" ) + parser.add_argument('-a', '--architecture', dest='gpu_architecture', required=False, default=default_gpus, #:sramecc+:xnack-" ) #gfx1030" ) #gfx906" ) # gfx1030" ) help='Set GPU architectures, e.g. all, gfx000, gfx803, gfx906:xnack-;gfx1030;gfx1100 (optional, default: all)') parser.add_argument('-v', '--verbose', required=False, default=False, action='store_true', help='Verbose build (default: False)') @@ -115,7 +117,7 @@ def config_cmd(): else: cmake_executable = "cmake" toolchain = "toolchain-linux.cmake" - cmake_platform_opts = f"-DROCM_DIR:PATH={rocm_path} -DCPACK_PACKAGING_INSTALL_PREFIX={rocm_path}" + cmake_platform_opts = [f"-DROCM_DIR:PATH={rocm_path}", f"-DCPACK_PACKAGING_INSTALL_PREFIX={rocm_path}"] tools = f"-DCMAKE_TOOLCHAIN_FILE={toolchain}" cmake_options.append( tools )